Streaming from Reddit and Twitter

Please do not run the code in this section (Chapter 1), since it is meant to be run as a standalone script on zeno server in order to:

  • stream the data by using relevant API:
    • for Twitter: tweepy
    • for Reddit: PRAW which stands for Python Reddit API Wrapper
  • save it as a list of dictionaries,
  • filter only for relevant information by using only dictionary elements that we need,
  • clean it and convert to a data frame format
  • save cleaned data frame into the Postgres DB that was set up on Zeno

Streaming script for Twitter

The Class MyStreamListener was inspired by https://gist.github.com/hugobowne/18f1c0c0709ed1a52dc5bcd462ac69f4

In [ ]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import String, Integer, Float, Boolean, DateTime
import time
import tweepy
import json
engine = create_engine('postgresql://consultant:pgHWR2018@zeno.lehre.hwr-berlin.de:5432/disruptive')
connection = engine.connect()

consumer_key = "Bcc9APjXSl4eH5nqJq6AqKQo0"
consumer_key_secret = "sT9VcZYL1yCnqnDzXETrU9nT09qdbmntmskq011cp2W9o8iHHh"
access_token = "988817670765273090-bKzdX7q9MOBs0ZXx6eclHxRmEWr4UCP"
access_token_secret = "9ioSZK6DWiDUi7y7y8aR2Rsr3jJUWyg2tnamrxK8cCfpx"

auth = tweepy.OAuthHandler(consumer_key, consumer_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

counter = 0
while True: # never ending loop
    counter += 1
    print("start iteration", counter)
    ## Stream tweets
    class MyStreamListener(tweepy.StreamListener):
        tweets_data = []
        def __init__(self, api=None): # initialize the class, always uses self keyword
            super(MyStreamListener, self).__init__() # super keyword makes all methods from class tweepy.StreeamListener
            # available to my extended class MyStreamListener
            # .__init__() runs the init method from tweepy.StreamListener
            self.num_tweets = 0

        def on_status(self, status):
            tweet = status._json
            self.tweets_data.append(tweet)
            self.num_tweets += 1
            if self.num_tweets < 2000: # get me 2000 tweets, 
                # later clean them and save them to Postgres DB, then sleep for 2 minutes and start over
                return True
            else:
                return False

        def on_error(self, status):
            print(status)

    # Initialize Stream listener and run all methods within
    l = MyStreamListener()

    # Create stream object with authentication
    stream = tweepy.Stream(auth, l)

    # Filter Twitter Streams to capture data by the keywords:
    stream.filter(track = ['Bitcoin', 'bitcoin', 'BitCoin', 'BTC', 'bitCoin', 
                           'BitcoinClassic', 'bitcoinclassic', 'bitcoinClassic', 'XBT',
                           'Ether', 'ether', 'ethereum', 'Ethereum', 'ETH', 'ETC',
                           'Ethereum Classic', 'EthereumClassic'])

    tweets_data = MyStreamListener().tweets_data
    
    ## CLEAN tweets_data
    text = [] # text of the tweet
    for index, value in enumerate(tweets_data):
        text.append(tweets_data[index]["text"])

    tweet_created = [] # UTC time when this Tweet was created.
    for index, value in enumerate(tweets_data):
        tweet_created.append(tweets_data[index]["created_at"])

    profile_created = [] # UTC datetime that the user account was created on Twitter.
    for index, value in enumerate(tweets_data):
        profile_created.append(tweets_data[index]["user"]["created_at"])

    user_name = [] # The name of the user, as they’ve defined it. Not necessarily a person’s name.
    for index, value in enumerate(tweets_data):
        user_name.append(tweets_data[index]["user"]["name"])

    user_screen_name = [] # The name of the user, as they’ve defined it. Not necessarily a person’s name.
    for index, value in enumerate(tweets_data):
        user_screen_name.append(tweets_data[index]["user"]["screen_name"])
        
    followers_count = []
    for index, value in enumerate(tweets_data):
        followers_count.append(tweets_data[index]["user"]["followers_count"])

    friends_count = []
    for index, value in enumerate(tweets_data):
        friends_count.append(tweets_data[index]["user"]["friends_count"])

    time_zone = [] # deprecated, but partially filled
    for index, value in enumerate(tweets_data):
        time_zone.append(tweets_data[index]["user"]["time_zone"])

    total_tweets = [] # The number of tweets_data (including retweets_data) issued by the user.
    for index, value in enumerate(tweets_data):
        total_tweets.append(tweets_data[index]["user"]["statuses_count"])

    user_location = [] # The user-defined location for this account’s profile.
    for index, value in enumerate(tweets_data):
        user_location.append(tweets_data[index]["user"]["location"])

    acc_descr = [] #  The user-defined UTF-8 string describing their account.
    for index, value in enumerate(tweets_data):
        acc_descr.append(tweets_data[index]["user"]["description"])

    verified = [] # blue verified badge on Twitter lets people know that it's an account of public interest and that it is authentic.
    for index, value in enumerate(tweets_data):
        verified.append(tweets_data[index]["user"]["verified"])

    lang = [] # language
    for index, value in enumerate(tweets_data):
        lang.append(tweets_data[index]["lang"])

    retweet_count = [] # Number of times this Tweet has been retweeted.
    for index, value in enumerate(tweets_data):
        retweet_count.append(tweets_data[index]["retweet_count"])

    retweeter_requoter = [] # follower
    for index, value in enumerate(tweets_data):
        retweeter_requoter.append(tweets_data[index]["user"]["screen_name"])

    influencer = [] # influencer = followee
    for index, value in enumerate(tweets_data):
        try:
            influencer.append(tweets_data[index]["retweeted_status"]["user"]["screen_name"])
        except:
            influencer.append("not retweeted post")

    influencer_quoted = [] # influencer = followee
    for index, value in enumerate(tweets_data):
        try:
            influencer_quoted.append(tweets_data[index]["quoted_status"]["user"]["screen_name"])
        except:
            influencer_quoted.append("not quoted post")
    
    full_text = [] # text of the tweet
    for index, value in enumerate(tweets_data):
        try:
            full_text.append(tweets_data[index]["extended_tweet"]["full_text"])
        except:
            full_text.append("not extended tweet")
    
    twitter_df = pd.DataFrame({"text":text, "tweet_created":tweet_created, "full_text":full_text,
                "profile_created":profile_created, "user_name":user_name, "followers_count":followers_count,
                "friends_count":friends_count, "time_zone":time_zone, "total_tweets":total_tweets,
                "user_location":user_location, "acc_descr":acc_descr, "verified":verified, "lang":lang,
                "retweet_count":retweet_count, "retweeter_requoter":retweeter_requoter,
                "influencer":influencer, "influencer_quoted":influencer_quoted, "user_screen_name":user_screen_name})

    twitter_df = twitter_df[twitter_df["lang"] == "en"] # exclude tweets_data in other languages
    twitter_df.drop(["lang"], inplace=True, axis=1)
    twitter_df["tweet_created"] = pd.to_datetime(twitter_df["tweet_created"], format = "%a %b %d %H:%M:%S +%f %Y")
    twitter_df["profile_created"] = pd.to_datetime(twitter_df["profile_created"], format = "%a %b %d %H:%M:%S +%f %Y")

    twitter_df.to_sql('twitter3', con=engine, if_exists='append', index=False,
                      dtype={"acc_descr":String(1000),"followers_count":Integer(), "friends_count":Integer(),
                            "influencer":String(100), "influencer_quoted":String(100), "profile_created":DateTime(),
                            "retweet_count":Integer(), "retweeter_requoter":String(100), "text":String(10000),
                            "time_zone":String(100), "total_tweets":Integer(), "tweet_created":DateTime(),
                            "user_location":String(1000), "user_name":String(1000), "verified":Boolean(), 
                             "full_text":String(10000), "user_screen_name":String(1000)})
    print(counter, "iteration is completed")
    time.sleep(900)

Streaming script for Reddit

In [ ]:
import pandas as pd
import praw # import  Python Reddit API Wrapper
from sqlalchemy import create_engine
from sqlalchemy.types import String, Integer, Float, Boolean, DateTime
import time
#import tweepy
#_____________________________________________________________________________________________________
# Stream the data from reddit and show the sentiment over time + sentiment comparison bw. BTC and ETH
#_____________________________________________________________________________________________________
client_id = "2BKbi2rOzcWy5w"
secret = "GOa9xfMkea62qn6U7yHdVonrF-g"
reddit = praw.Reddit(client_id = client_id, client_secret = secret, password = "************",
                     user_agent='praw_test', username='***********') # Reddit instance

engine = create_engine('postgresql://consultant:pgHWR2018@zeno.lehre.hwr-berlin.de:5432/disruptive')
connection = engine.connect()

import re
from textblob import TextBlob
def get_sentiment(sentence):
    analysis = TextBlob(sentence)
    return(analysis.sentiment.polarity) # > 0 positive, < 0 negative

def get_grouped_sentiment(sentence):
    '''function to classify sentiment of passed SA score'''
    analysis = TextBlob(sentence)
    score = analysis.sentiment.polarity
    if score > 0.05:
        return 'positive'
    elif score <= 0.05 and score > -0.005:
        return 'neutral'
    else:
        return 'negative'

print("start")
counter = 0
while True:
    counter += 1
    print("start iteration", counter)
    list_of_items = []
    fields = ('title', 'created_utc', 'num_comments', 'ups', 'downs')
    # fields attribute has the relevant "key" names that we want from each submission
    # we deliberately dispose of selftext (long form), comments and url for consistency bw. reddits,
    # since only some of them have selftext and the comments are nested, making their storage in a single table impossible
    for submission in reddit.subreddit('CryptoCurrency').new(limit=None):
        to_dict = vars(submission) # vars returns a dictionary with the attributes of the object.
        sub_dict = {field:to_dict[field] for field in fields} # we grab specific values such as title, ups/downs from the dictionary
        list_of_items.append(sub_dict) # to_dict is just a variable that is the dictionary form of each submission

    ## clean the reddits
    reddit_df = pd.DataFrame(list_of_items)
    reddit_df["created_utc"] = pd.to_datetime(reddit_df["created_utc"], unit='s') # clean the date format: up to seconds, without miliseconds
    #engine = create_engine('postgresql://consultant:pgHWR2018@zeno.lehre.hwr-berlin.de:5432/disruptive')
    #connection = engine.connect()

    # Regexp: split df into BTC & ETH
    titles = reddit_df.title.tolist()
    btc_titles = [i for i in titles if \
              len(re.findall(r"(bitcoin|Bitcoin|BTC|BitCoin|bitCoin|BitcoinClassic|Bitcoin Classic|bitcoinclassic|bitcoinClassic|XBT)", i)) > 0]

    eth_titles = [i for i in titles if \
              len(re.findall(r"(ethereum|Ethereum|ETH|ETC|Ethereum Classic|EthereumClassic|ether|eth)", i)) > 0]

    btc_SA_score = [get_sentiment(sentence) for sentence in btc_titles]
    btc_SA_score_grouped = [get_grouped_sentiment(sentence) for sentence in btc_titles]
    btc_reddit = pd.DataFrame({"title":btc_titles, "SA_score":btc_SA_score, "SA_score_grouped":btc_SA_score_grouped})
    btc_reddit = pd.merge(btc_reddit, reddit_df, on="title", how="inner")

    eth_SA_score = [get_sentiment(sentence) for sentence in eth_titles]
    eth_SA_score_grouped = [get_grouped_sentiment(sentence) for sentence in eth_titles]
    eth_reddit = pd.DataFrame({"title":eth_titles, "SA_score":eth_SA_score, "SA_score_grouped":eth_SA_score_grouped})
    eth_reddit = pd.merge(eth_reddit, reddit_df, on="title", how="inner")
    # replace btc_df with btc_reddit in all scripts

    btc_reddit.to_sql('btc_reddit', con=engine, if_exists='append', index=False, \
                     dtype={"created_utc": DateTime(), "downs":Integer(), "ups":Integer(), \
                           "num_comments":Integer(), "title":String(10000), "SA_score":Float(), "SA_score_grouped":String(20)})

    eth_reddit.to_sql('eth_reddit', con=engine, if_exists='append', index=False, \
                     dtype={"created_utc": DateTime(), "downs":Integer(), "ups":Integer(), \
                           "num_comments":Integer(), "title":String(10000), "SA_score":Float(), "SA_score_grouped":String(20)})

    print(counter, " iteration is completed.")
    time.sleep(600)

Entire dashboard script

The following script produces an HTML file that runs on localhost by using Flask server on port 8050: http://127.0.0.1:8050/

In [ ]:
import dash
import dash_auth
import dash_core_components as dcc
import dash_html_components as html
import dash_table_experiments as dt
import plotly.graph_objs as go
import re
import numpy as np
import pandas as pd
import requests
import json
from textblob import TextBlob
from sqlalchemy import create_engine
from sqlalchemy.types import String, Integer, Float, Boolean, DateTime
# from dashapp import server as application
app = dash.Dash(__name__)

# if you want to implement USER and PASSWORD, please uncomment the following 3 lines of code:
# from dash.dependencies import Input, Output
# USERNAME_PASSWORD_PAIRS = [['bipm', 'crypto']]
# auth = dash_auth.BasicAuth(app,USERNAME_PASSWORD_PAIRS)


#_____________________________________________________________________________________________________
# Get the streamed data from REDDIT and show the sentiment over time + aggregated sentiment + comparison bw. BTC and ETH
#_____________________________________________________________________________________________________
# REDDIT preprocessing
#_____________________________________________________________________________________________________

engine = create_engine('postgresql://consultant:pgHWR2018@zeno.lehre.hwr-berlin.de:5432/disruptive')
connection = engine.connect()
btc_reddit = pd.read_sql(sql = "select distinct title, created_utc, \"SA_score_grouped\", \"SA_score\" from btc_reddit order by created_utc desc", con = connection, index_col=None)
eth_reddit = pd.read_sql(sql = "select distinct title, created_utc, \"SA_score_grouped\", \"SA_score\" from eth_reddit order by created_utc desc", con = connection, index_col=None)

# to later display the interactive data table
reddit = pd.concat([btc_reddit, eth_reddit], axis=0, join='outer', # to get UNION of rows, instead of intersection
          join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)
reddit.SA_score = round(reddit.SA_score,2)

# Aggregated sentiment on Reddit
btc_grouped = btc_reddit[["SA_score", "SA_score_grouped"]].groupby("SA_score_grouped").count()
btc_grouped["sentiment"] = btc_grouped.index
btc_grouped.reset_index(drop=True, inplace=True)
btc_grouped.rename(columns={"SA_score": "nr_of_tweets"}, inplace=True)

eth_grouped = eth_reddit[["SA_score", "SA_score_grouped"]].groupby("SA_score_grouped").count()
eth_grouped["sentiment"] = eth_grouped.index
eth_grouped.reset_index(drop=True, inplace=True)
eth_grouped.rename(columns={"SA_score": "nr_of_tweets"}, inplace=True)

#_____________________________________________________________________________________________________
# BTC and ETH values over time - Preprocessing
#_____________________________________________________________________________________________________

url = 'https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=365'
r = requests.get(url) # Decode the JSON data into a dictionary: json_data
json_data = r.json()
btc_values_df = pd.DataFrame(json_data["Data"]) # dictionary of currency values is stored as a list under the key "Data"
btc_values_df["timestamp"] = pd.to_datetime(btc_values_df["time"], unit='s') # clean the date format: up to seconds, without miliseconds

url = 'https://min-api.cryptocompare.com/data/histoday?fsym=ETH&tsym=USD&limit=365'
r = requests.get(url)
json_data = r.json() # Decode the JSON data into a dictionary: json_data
eth_values_df = pd.DataFrame(json_data["Data"])
eth_values_df["timestamp"] = pd.to_datetime(eth_values_df["time"], unit='s') # clean the date format: converts the unix timestamp to pandas date data type

#_____________________________________________________________________________________________________
# Get the streamed data from TWITTER and show the sentiment over time + aggregated sentiment + comparison bw. BTC and ETH
#_____________________________________________________________________________________________________
# Twitter preprocessing
#_____________________________________________________________________________________________________

twitter_df = pd.read_sql(sql = "select distinct text, tweet_created from twitter3 TABLESAMPLE SYSTEM(1) where text ~* '(btc|#eth|ether|bitcoin|ethereum)' order by tweet_created desc", con = connection, index_col=None)

list_of_tweets = twitter_df.text.tolist()

# even though the data has been cleaned directly throgh SQL query, we use RegExp to separate tweets related to BTC and ETH
eth_tweets = [tweet for tweet in list_of_tweets if \
              len(re.findall(r"(ethereum|Ethereum|ETH|ETC|Ethereum Classic|EthereumClassic|ether|eth)", tweet)) > 0]

btc_tweets = [tweet for tweet in list_of_tweets if \
              len(re.findall(r"(bitcoin|Bitcoin|BTC|BitCoin|bitCoin|BitcoinClassic|Bitcoin Classic|bitcoinclassic|bitcoinClassic|XBT)", tweet)) > 0]

# Twitter Sentiment analysis
def get_sentiment(sentence):
    analysis = TextBlob(sentence)
    return(round(analysis.sentiment.polarity, 2)) # > 0 positive, < 0 negative

btc_twitter_sa = [get_sentiment(sentence) for sentence in btc_tweets]
twitter_btc_df = pd.DataFrame({"text":btc_tweets, "SA_score":btc_twitter_sa})
twitter_btc_df = pd.merge(twitter_btc_df, twitter_df, how='inner', on="text")

eth_twitter_sa = [get_sentiment(sentence) for sentence in eth_tweets]
twitter_eth_df = pd.DataFrame({"text":eth_tweets, "SA_score":eth_twitter_sa})
twitter_eth_df = pd.merge(twitter_eth_df, twitter_df, how="inner", on="text")

# to later display the interactive data table
twitter = pd.concat([twitter_btc_df, twitter_eth_df], axis=0, join='outer', # to get UNION of rows, instead of intersection
          join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)

#  Remove duplicates for detailed sentiment plot
twitter_btc_df2 = twitter_btc_df.drop_duplicates(subset = ["text"], keep='first', inplace=False)
twitter_eth_df2 = twitter_eth_df.drop_duplicates(subset = ["text"], keep='first', inplace=False)

# Function for aggregated sentiment
def get_short_sentiment(sentence):
    '''function to classify sentiment of passed SA score'''
    if sentence > 0.05:
        return 'positive'
    elif sentence <= 0.05 and sentence > -0.005:
        return 'neutral'
    else:
        return 'negative'

short_twitter_btc = [get_short_sentiment(t) for t in twitter_btc_df.SA_score] # twitter_btc
short_twitter_eth = [get_short_sentiment(t) for t in twitter_eth_df.SA_score] # twitter_eth

twitter_btc_grouped = pd.DataFrame({"nr_of_tweets":twitter_btc_df.text, "short":short_twitter_btc}).groupby("short")
twitter_eth_grouped = pd.DataFrame({"nr_of_tweets":twitter_eth_df.text, "short":short_twitter_eth}).groupby("short")

twitter_btc_grouped = twitter_btc_grouped.count()
twitter_eth_grouped = twitter_eth_grouped.count()

twitter_btc_grouped["sentiment"] = twitter_btc_grouped.index
twitter_eth_grouped["sentiment"] = twitter_eth_grouped.index

twitter_btc_grouped.reset_index(drop=True, inplace=True)
twitter_eth_grouped.reset_index(drop=True, inplace=True)

#______________________________________________________________________________
# News preprocessing
#_____________________________________________________________________________________________________

ccn = pd.read_sql(sql = "select distinct article, date from ccn_articles order by date desc",
                         con = connection, index_col=None)

ccn_sa = [get_sentiment(sentence) for sentence in ccn.article]
ccn_sa_df = pd.DataFrame({"article":ccn.article, "SA_score":ccn_sa})
ccn_df = pd.merge(ccn_sa_df, ccn, how='inner', on="article")

short_ccn = [get_short_sentiment(t) for t in ccn_df.SA_score] # twitter_btc
ccn_grouped = pd.DataFrame({"nr_of_articles":ccn_df.article, "short":short_ccn}).groupby("short").count()
ccn_grouped["sentiment"] = ccn_grouped.index
ccn_grouped.reset_index(drop=True, inplace=True)

#______________________________________________________________________________
# Simple BoW model
#______________________________________________________________________________

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize

n = 10
def generate_word_list(text_col, nr_words = n):
    tokens = word_tokenize(text_col.to_string()) # tokenize
    lower_tokens = [t.lower() for t in tokens] # Convert the tokens into lowercase: lower_tokens
    alpha_only = [t for t in lower_tokens if t.isalpha()] # Retain alphabetic words: alpha_only
    stopwords = nltk.corpus.stopwords.words('english') # Remove all stop words: no_stops
    newStopWords = ["rt", "bitcoin", "crypto", "cryptocurrency", "blockchain", "blockcha", "btc", "bitcoi", "bitcoins", "daily", "say", "could",
                   "price", "ethereum", "eth", "classic", "exchange", "market", "cryptocurrencie", "one", "first", "short", "check",
                   "cryptocurrencies", "http", "htttp", "hour", "list", "u", "new", "vi", "ccn", "etc", "usd"]
    stopwords.extend(newStopWords)
    no_stops = [t for t in alpha_only if t not in stopwords]
    wordnet_lemmatizer = WordNetLemmatizer() # create instance of the WordNetLemmatizer class
    lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops if len(t)>1] # Lemmatize all tokens into a new list
    lemmatized = [t for t in lemmatized if t not in stopwords] # remove stopwords again after lemmatization
    bow = Counter(lemmatized) # Create the bag-of-words: bow
    word = []
    word_count = []
    for i in range(nr_words):
        word.append(bow.most_common(nr_words)[i][0])
        word_count.append(bow.most_common(nr_words)[i][1])
    words_and_counts_df = pd.DataFrame({"word":word, "word_count":word_count})
    return(words_and_counts_df) # return the n most common tokens

#______________________________________________________________________________
# Aggregate Sentiment by day
#______________________________________________________________________________
# Reddit
minDate = btc_reddit["created_utc"].min()
maxDate = btc_reddit["created_utc"].max()
ts_btc_reddit = btc_reddit.set_index("created_utc", inplace=False)
ts_btc_reddit = ts_btc_reddit.SA_score.resample('D').mean()
ts_eth_reddit = eth_reddit.set_index("created_utc", inplace=False)
ts_eth_reddit = ts_eth_reddit.SA_score.resample('D').mean()
standardized_reddit_scores = pd.DataFrame({'BTC':ts_btc_reddit,'ETH':ts_eth_reddit})
# Since the server might be down on certain days, we need to ensure that our time series has no discontinuities: interpolate() fills gaps of any size with a straight line
standardized_reddit_scores['BTC'].interpolate(method='linear', inplace=True)
standardized_reddit_scores['ETH'].interpolate(method='linear', inplace=True)

# Twitter
ts_twitter_btc_df = twitter_btc_df.set_index("tweet_created", inplace=False)
ts_twitter_btc_df = ts_twitter_btc_df.SA_score.resample('D').mean()
ts_twitter_eth_df = twitter_eth_df.set_index("tweet_created", inplace=False)
ts_twitter_eth_df = ts_twitter_eth_df.SA_score.resample('D').mean()
standardized_twitter_scores = pd.DataFrame({'BTC':ts_twitter_btc_df, 'ETH':ts_twitter_eth_df})
# Since the server might be down on certain days, we need to ensure that time series has no discontinuities: interpolate() fills gaps of any size with a straight line
standardized_twitter_scores['BTC'].interpolate(method='linear', inplace=True)
standardized_twitter_scores['ETH'].interpolate(method='linear', inplace=True)

#  News
ts_ccn = ccn_df.set_index("date", inplace=False)
ts_ccn = ts_ccn.SA_score.resample('D').mean()
standardized_ccn_scores = pd.DataFrame({'CCN':ts_ccn})

#______________________________________________________________________________
# Preprocessing for aggregated plot:
#______________________________________________________________________________
# BTC
url = 'https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=20' # + str(len(standardized_reddit_scores))
# we want only last few days--> &toTs=1522224000
r = requests.get(url) # Decode the JSON data into a dictionary: json_data
json_data = r.json()
btc_mini = pd.DataFrame(json_data["Data"]) # dictionary of currency values is stored as a list under the key "Data"
btc_mini["timestamp"] = pd.to_datetime(btc_mini["time"], unit='s')

# ETH
url = 'https://min-api.cryptocompare.com/data/histoday?fsym=ETH&tsym=USD&limit=20' # we want only last few days
r = requests.get(url) # Decode the JSON data into a dictionary: json_data
json_data = r.json()
eth_mini = pd.DataFrame(json_data["Data"]) # dictionary of currency values is stored as a list under the key "Data"
eth_mini["timestamp"] = pd.to_datetime(eth_mini["time"], unit='s')

btc_mini.set_index("timestamp", inplace=True)
eth_mini.set_index("timestamp", inplace=True)

# Now we scale the "Mini" BTC/ETH values so that we can plot them together with sentiment on the same axis.
# we scale values to be between -1 and 1, i.e. on the same scale as the sentiment values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1))
btc_scaled = pd.DataFrame(scaler.fit_transform(btc_mini), columns=btc_mini.columns)
eth_scaled = pd.DataFrame(scaler.fit_transform(eth_mini), columns=eth_mini.columns)

#______________________________________________________________________________
### Preprocessing for the map
#______________________________________________________________________________
#get the store name that can support crytocurrency
mapbox_access_token = 'pk.eyJ1Ijoic2FyYXB1dHJpIiwiYSI6ImNqaTMzNDBuaTB2djgzdm9hZXlnMTl1cW4ifQ.8vsxIGidl6bUz-u_rK3YSQ'
url = 'https://coinmap.org/api/v1/venues/'
r = requests.get(url) # Decode the JSON data into a dictionary: json_data
json_data = r.json()
venues_df = pd.DataFrame(json_data["venues"]) # dictionary of currency values is stored as a list under the key "Data"
#btc_values_df["timestamp"] = pd.to_datetime(btc_values_df["time"], unit='s') # clean the date format: up to seconds, without miliseconds
#venues_df.info()
site_lat = venues_df["lat"]
site_lon = venues_df["lon"]
locations_name = venues_df["name"]
category_name = venues_df["category"]
#get the address data from google maps from latitude and longitude

#______________________________________________________________________________
# Preprocessing for the NER = Named Entity Recognition
#______________________________________________________________________________
import pickle
with open('Pickle_dash/ccn_df_tagged.pkl', 'rb') as f:
    ccn_df_tagged = pickle.load(f)

with open('Pickle_dash/df_agg_count_ccn.pkl', 'rb') as f:
    df_agg_count_ccn = pickle.load(f)

with open('Pickle_dash/df_btc_reddit_org_agg.pkl', 'rb') as f:
    df_btc_reddit_org_agg = pickle.load(f)

with open('Pickle_dash/df_btc_reddit_pep_agg.pkl', 'rb') as f:
    df_btc_reddit_pep_agg = pickle.load(f)

with open('Pickle_dash/df_tweet_agg_sum.pkl', 'rb') as f:
    df_tweet_agg_sum = pickle.load(f)

with open('Pickle_dash/df_tweet_btc_agg_sum.pkl', 'rb') as f:
    df_tweet_btc_agg_sum = pickle.load(f)

with open('Pickle_dash/df_tweet_eth_agg_sum.pkl', 'rb') as f:
    df_tweet_eth_agg_sum = pickle.load(f)

with open('Pickle_dash/df_tweet_pep_agg.pkl', 'rb') as f:
    df_tweet_pep_agg = pickle.load(f)

with open('Pickle_dash/df_tweet_pep_btc_agg.pkl', 'rb') as f:
    df_tweet_pep_btc_agg = pickle.load(f)

with open('Pickle_dash/df_tweet_pep_eth_agg.pkl', 'rb') as f:
    df_tweet_pep_eth_agg = pickle.load(f)

with open('Pickle_dash/df_eth_reddit_org_agg.pkl', 'rb') as f:
    df_eth_reddit_org_agg = pickle.load(f)

with open('Pickle_dash/df_eth_reddit_pep_agg.pkl', 'rb') as f:
    df_eth_reddit_pep_agg = pickle.load(f)

#_____________________________________________________________________________________________________
# Define the app layout incl. all plots
#_____________________________________________________________________________________________________
app.layout = html.Div([html.H1('This dashboard shows current trends about Bitcoin and Ethereum in order to help you to make an informed decision for your investment',
                        id='h1-element'),
                html.H3("Is the sentiment in the News and Social Media connected to the price developments over time? Let's have a look!"),
                            dcc.Graph(id='barplot5',
                                figure = {'data':[
                                go.Scatter(
                                x = btc_mini.index,
                                y = btc_scaled.close,
                                name = "BTC in USD (scaled)",
                                visible=True,
                                marker=dict(color='#f2a900'),
                                mode = 'markers+lines'
                                ),
                                go.Scatter(
                                x = eth_mini.index,
                                y = eth_scaled.close,
                                name = "ETH in USD (scaled)",
                                visible=True,
                                marker=dict(color='#4d4d4e'),
                                mode = 'markers+lines'
                                ),
                                go.Scatter(
                                x = standardized_reddit_scores.index,
                                y = standardized_reddit_scores.BTC,
                                line = dict(color = '#f2a900', dash = 'dot'),
                                name = "BTC Sentiment on Reddit",
                                visible=True,
                                #marker=dict(color='green'),
                                mode = 'markers+lines'
                                ),
                                go.Scatter(
                                x = standardized_reddit_scores.index,
                                y = standardized_reddit_scores.ETH,
                                    line = dict(color = '#4d4d4e', dash = 'dot'),
                                name = "ETH Sentiment on Reddit",
                                visible=True,
                                #marker=dict(color='blue'),
                                mode = 'markers+lines'
                                ),
                                go.Scatter(
                                x = standardized_ccn_scores.index,
                                y = standardized_ccn_scores.CCN,
                                line = dict(color = 'green', dash = 'dash'),
                                name = "BTC and ETH Sentiment in the News",
                                visible=True,
                                mode = 'markers+lines'
                                ),
                                go.Scatter(
                                x = standardized_twitter_scores.index,
                                y = standardized_twitter_scores.BTC,
                                line = dict(color = 'blue', dash = 'solid'),
                                name = "BTC Sentiment on Twitter",
                                visible=True,
                                #marker=dict(color='green'),
                                mode = 'markers+lines'
                                ),
                                go.Scatter(
                                x = standardized_twitter_scores.index,
                                y = standardized_twitter_scores.ETH,
                                    line = dict(color = 'purple', dash = 'solid'),
                                name = "ETH Sentiment on Twitter",
                                visible=True,
                                #marker=dict(color='blue'),
                                mode = 'markers+lines'
                                )],
                                'layout':go.Layout(title = 'BTC and ETH values & sentiment', showlegend=True,
                                                    updatemenus = list([
                                                        dict(active=-1, buttons=list([
                                                                dict(label = 'BTC and ETH Values over time',
                                                                     method = 'update',
                                                                     args = [{'visible': [True, True, False, False, False, False, False]},
                                                                             {'title': 'BTC and ETH values'}]),
                                                                dict(label = 'BTC and ETH Sentiment on Reddit',
                                                                     method = 'update',
                                                                     args = [{'visible': [False, False, True, True, False, False, False]},
                                                                             {'title': 'BTC and ETH sentiment on Reddit'}]),
                                                                dict(label = 'News',
                                                                     method = 'update',
                                                                     args = [{'visible': [False, False, False, False, True, False, False]},
                                                                             {'title': 'BTC and ETH sentiment in the News'}]),
                                                                dict(label = 'Twitter BTC & ETH',
                                                                     method = 'update',
                                                                     args = [{'visible': [False, False, False, False, False, True, True]},
                                                                             {'title': 'BTC and ETH sentiment on Twitter'}]),
                                                                dict(label = 'Reset: show all',
                                                                     method = 'update',
                                                                     args = [{'visible': [True, True, True, True, True, True, True]},
                                                                             {'title': 'BTC and ETH values & sentiment in the News and Social Media'}])
                                                            ])
                                                        )
                                                    ])
                                                    ,
                                                    xaxis = dict(title = 'Time', range = [minDate, maxDate]),
                                                    yaxis = dict(title = 'Sentiment & Values over time')
                                            )}),
                        html.P("In this dashboard, you can analyze the sentiment on social media and in the news regarding the two most popular cryptocurrencies: Bitcoin (BTC) and Ethereum (ETH).\n \
                        You can choose the source you are interested in by selecting from the dropdown-menu on the left. \
                        The sentiment score on the Y axis is a value between -1, denoting a strong negative sentiment, and 1, very positive sentiment."),
                    dcc.Graph(id='scatterplot1',
                    figure = {'data':[
                            go.Scatter(
                            x = btc_reddit.created_utc,
                            y = btc_reddit.SA_score,
                            name = "BTC Sentiment on Reddit",
                            visible=True,
                            marker=dict(color='#f2a900'),
                            mode = 'markers+lines'
                            ),
                            go.Scatter(
                            x = eth_reddit.created_utc,
                            y = eth_reddit.SA_score,
                            name = "ETH Sentiment on Reddit",
                            visible=True,
                            marker=dict(color='#4d4d4e'),
                            mode = 'markers+lines'
                            ),
                            go.Scatter(
                            x = twitter_btc_df2.tweet_created,
                            y = twitter_btc_df2.SA_score,
                            name = "BTC Sentiment on Twitter",
                            visible=False,
                            marker=dict(color='#f2a900'),
                            mode = 'markers+lines'
                            ),
                            go.Scatter(
                            x = twitter_eth_df2.tweet_created,
                            y = twitter_eth_df2.SA_score,
                            name = "ETH Sentiment on Twitter",
                            visible=False,
                            marker=dict(color='#4d4d4e'),
                            mode = 'markers+lines'
                            ),
                            go.Scatter(
                            x = ccn_df.date[np.logical_and(ccn_df.date >= minDate, ccn_df.date <= maxDate)],
                            y = ccn_df.SA_score[np.logical_and(ccn_df.date >= minDate, ccn_df.date <= maxDate)],
                            name = "BTC and ETH Sentiment in the News",
                            visible=False,
                            marker=dict(color='#4d4d4e'),
                            mode = 'markers+lines'
                            )
                    ],
                            'layout':go.Layout(title = 'BTC and ETH sentiment over time', showlegend=True,
                                                updatemenus = list([
                                                    dict(active=-1,
                                                         buttons=list([
                                                            dict(label = 'BTC Sentiment on Reddit',
                                                                 method = 'update',
                                                                 args = [{'visible': [True, False, False, False, False]},
                                                                         {'title': 'BTC sentiment over time on Reddit'}]),
                                                            dict(label = 'ETH Sentiment on Reddit',
                                                                 method = 'update',
                                                                 args = [{'visible': [False, True, False, False, False]},
                                                                         {'title': 'ETH sentiment over time on Reddit'}]),
                                                            dict(label = 'Both: Sentiment on Reddit',
                                                                 method = 'update',
                                                                 args = [{'visible': [True, True, False, False, False]},
                                                                         {'title': 'BTC and ETH sentiment over time on Reddit'}]),
                                                            dict(label = 'BTC Sentiment on Twitter',
                                                                 method = 'update',
                                                                 args = [{'visible': [False, False, True, False, False]},
                                                                         {'title': 'BTC sentiment over time on Twitter'}]),
                                                            dict(label = 'ETH Sentiment on Twitter',
                                                                 method = 'update',
                                                                 args = [{'visible': [False, False, False, True, False]},
                                                                         {'title': 'ETH sentiment over time on Twitter'}]),
                                                            dict(label = 'Both: Sentiment on Twitter',
                                                                 method = 'update',
                                                                 args = [{'visible': [False, False, True, True, False]},
                                                                         {'title': 'BTC and ETH sentiment over time on Twitter'}]),
                                                            dict(label = 'BTC & ETH Sentiment in the News',
                                                                 method = 'update',
                                                                 args = [{'visible': [False, False, False, False, True]},
                                                                         {'title': 'BTC and ETH Sentiment in the News'}])
                                                        ]),
                                                    )
                                                ])
                                                ,
                                                xaxis = dict(title = 'Time'), #, range = [minDate, maxDate]),
                                                yaxis = dict(title = 'Sentiment')
                                        )}
                                        ),
# Sentiment grouped
                    dcc.Graph(id='pie2',
                    figure = {'data':[
                        go.Pie(
                            labels=btc_grouped.sentiment,
                            values=btc_grouped.nr_of_tweets,
                            name = 'BTC Sentiment on Reddit',
                            visible=True,
                            marker=dict(colors=['#fc586e', '#fffaaa', '#87d686']) # set the colors to red, yellow and green for pie chart
                        ),
                        go.Pie(
                            labels=eth_grouped.sentiment,
                            values=eth_grouped.nr_of_tweets,
                            name = 'ETH Sentiment on Reddit',
                            visible=False,
                            marker=dict(colors=['#fc586e', '#fffaaa', '#87d686'])
                        ),
                        go.Pie(
                            labels=twitter_btc_grouped.sentiment,
                            values=twitter_btc_grouped.nr_of_tweets,
                            name = 'BTC Sentiment on Twitter',
                            visible=False,
                            marker=dict(colors=['#fc586e', '#fffaaa', '#87d686'])
                        ),
                        go.Pie(
                            labels=twitter_eth_grouped.sentiment,
                            values=twitter_eth_grouped.nr_of_tweets,
                            name = 'ETH Sentiment on Twitter',
                            visible=False,
                            marker=dict(colors=['#fc586e', '#fffaaa', '#87d686'])
                        ),
                        go.Pie(
                            labels=ccn_grouped.sentiment,
                            values=ccn_grouped.nr_of_articles,
                            name = 'BTC and ETH Sentiment in the News',
                            visible=False,
                            marker=dict(colors=['#fc586e', '#fffaaa', '#87d686'])
                        )
                        ],
                        'layout':go.Layout(title = 'BTC sentiment on Reddit', showlegend=True,
                                            updatemenus = list([
                                                dict(active=-1,
                                                     buttons=list([
                                                        dict(label = 'BTC sentiment on Reddit',
                                                             method = 'update',
                                                             args = [{'visible': [True, False, False, False, False]},
                                                                     {'title': 'BTC sentiment on Reddit'}]),
                                                        dict(label = 'ETH sentiment on Reddit',
                                                             method = 'update',
                                                             args = [{'visible': [False, True, False, False, False]},
                                                                     {'title': 'ETH sentiment on Reddit'}]),
                                                         dict(label = 'BTC sentiment on Twitter',
                                                              method = 'update',
                                                              args = [{'visible': [False, False, True, False, False]},
                                                                      {'title': 'BTC sentiment on Twitter'}]),
                                                         dict(label = 'ETH sentiment on Twitter',
                                                              method = 'update',
                                                              args = [{'visible': [False, False, False, True, False]},
                                                                      {'title': 'ETH sentiment on Twitter'}]),
                                                        dict(label = 'BTC & ETH Sentiment in the News',
                                                              method = 'update',
                                                              args = [{'visible': [False, False, False, False, True]},
                                                                      {'title': 'BTC and ETH Sentiment in the News'}])
                                                    ]),
                                                )
                                            ])
                                        )}
                                        ),
                html.H2("Where can we actually pay by using Bitcoin and Ethereum? Let's find out."),
                            dcc.Graph(id='map5',
                                figure = {'data':[
                                    go.Scattermapbox(
                                        lat=site_lat,
                                        lon=site_lon,
                                        mode='markers',
                                        marker=dict(
                                            size=9
                                        ),
                                        text='store name: ' +locations_name+ ', ' + 'category: ' +category_name,
                                        hoverinfo="text"
                                    )
                                ],
                                'layout':go.Layout(autosize=True,
                                                    hovermode='closest',
                                                    mapbox=dict(
                                                        accesstoken=mapbox_access_token,
                                                        bearing=0,
                                                        center=dict(lat=52.52,lon=13.4050),
                                                        pitch=0,
                                                        zoom=10
                                                    ))}),
        html.H2("Which organizations and people are currently mentioned in the News and Social Media with respect to Bitcoin and Ethereum?"),
        dcc.Graph(id='organizations',
        figure = {'data':[
                go.Bar(
                x = df_tweet_agg_sum['Number'].head(15).sort_index(ascending=False),
                y = df_tweet_agg_sum['Organization'].head(15).sort_index(ascending=False),
                name = "Twitter (BTC & ETH)",
                visible=True,
                orientation = 'h'
                ),
                go.Bar(
                x = df_tweet_btc_agg_sum['Number'].head(15).sort_index(ascending=False),
                y = df_tweet_btc_agg_sum['Organization'].head(15).sort_index(ascending=False),
                name = "Twitter (BTC)",
                visible=True,
                orientation = 'h'
                ),
                go.Bar(
                x = df_tweet_eth_agg_sum['Number'].head(15).sort_index(ascending=False),
                y = df_tweet_eth_agg_sum['Organization'].head(15).sort_index(ascending=False),
                name = "Twitter (ETH)",
                visible=True,
                orientation = 'h'
                ),
                go.Bar(
                x = df_btc_reddit_org_agg['Number'].head(5).sort_index(ascending=False),
                y = df_btc_reddit_org_agg['Organization'].head(5).sort_index(ascending=False),
                name = "Reddit (BTC)",
                visible=True,
                orientation = 'h',
                marker=dict(
                color='rgb(231, 60, 0)')
                ),
                go.Bar(
                x = df_eth_reddit_org_agg['Number'].head(5).sort_index(ascending=False),
                y = df_eth_reddit_org_agg['Organization'].head(5).sort_index(ascending=False),
                name = "Reddit (ETH)",
                visible=True,
                orientation = 'h',
                marker=dict(
                color='rgb(231, 60, 0)')
                ),
                go.Bar(
                x = df_agg_count_ccn['Sum'].head(15).sort_index(ascending=False),
                y = df_agg_count_ccn['Organization'].head(15).sort_index(ascending=False),
                name = "CCN news",
                visible=True,
                orientation = 'h',
                marker=dict(
                color='rgb(231, 118, 0)')
                ),

        ],
                'layout':go.Layout(title = 'Organizations on Twitter, Reddit & CCN news', showlegend=True,
                                    updatemenus = list([
                                        dict(active=-1,
                                             buttons=list([
                                                dict(label = 'Twitter (BTC & ETH)',
                                                     method = 'update',
                                                     args = [{'visible': [True, False, False, False, False, False]},
                                                             {'title': 'TOP 15 organizations on Twitter (BTC & ETH)'}]),
                                                dict(label = 'Twitter (BTC)',
                                                     method = 'update',
                                                     args = [{'visible': [False, True, False, False, False, False]},
                                                             {'title': 'TOP 15 organizations on Twitter (BTC)'}]),
                                                dict(label = 'Twitter (ETH)',
                                                     method = 'update',
                                                     args = [{'visible': [False, False, True, False, False, False]},
                                                             {'title': 'TOP 15 organizations on Twitter (ETH)'}]),
                                                dict(label = 'Reddit (BTC)',
                                                     method = 'update',
                                                     args = [{'visible': [False, False, False, True, False, False]},
                                                             {'title': 'TOP 5 organizations on Reddit (BTC)'}]),

                                                dict(label = 'Reddit (ETH)',
                                                     method = 'update',
                                                     args = [{'visible': [False, False, False, False, True, False]},
                                                             {'title': 'TOP 5 organizations on Reddit (ETH)'}]),
                                                dict(label = 'CCN news',
                                                     method = 'update',
                                                     args = [{'visible': [False, False, False, False, False, True]},
                                                             {'title': 'TOP 15 organizations on CCN news'}]),
                                                dict(label = 'Reset: show all',
                                                     method = 'update',
                                                     args = [{'visible': [True, True, True, True, True, True]},
                                                             {'title': 'TOP 15 organizations on CCN news'}])
                                            ]),

                                            direction = 'down',
                                            pad = {'r': 10, 't': 10},
                                            showactive = True,
                                            x = 0,
                                            xanchor = 'right',
                                            y = 1.2,
                                            yanchor = 'top'
                                        )
                                    ])
                                    ,
                                    xaxis = dict(title = 'Number of occurrences'),
                                    yaxis = dict(title = ''),
                                    margin=dict(
                l=250,
                r=20,
                t=70,
                b=70,
            )

                            )}
        ),
        dcc.Graph(id='people',
            figure = {'data':[
                    go.Bar(
                    x = df_tweet_pep_agg['Number'].head(15).sort_index(ascending=False),
                    y = df_tweet_pep_agg['Person'].head(15).sort_index(ascending=False),
                    name = "Twitter (BTC & ETH)",
                    visible=True,
                    orientation = 'h'
                    ),
                    go.Bar(
                    x = df_tweet_pep_btc_agg['Number'].head(15).sort_index(ascending=False),
                    y = df_tweet_pep_btc_agg['Person'].head(15).sort_index(ascending=False),
                    name = "Twitter (BTC)",
                    visible=True,
                    orientation = 'h'
                    ),
                    go.Bar(
                    x = df_tweet_pep_eth_agg['Number'].head(15).sort_index(ascending=False),
                    y = df_tweet_pep_eth_agg['Person'].head(15).sort_index(ascending=False),
                    name = "Twitter (ETH)",
                    visible=True,
                    orientation = 'h'
                    ),
                    go.Bar(
                    x = df_btc_reddit_pep_agg['Number'].head(5).sort_index(ascending=False),
                    y = df_btc_reddit_pep_agg['Person'].head(5).sort_index(ascending=False),
                    name = "Reddit (BTC)",
                    visible=True,
                    orientation = 'h',
                    marker=dict(
                    color='rgb(231, 60, 0)')
                    ),
                    go.Bar(
                    x = df_eth_reddit_pep_agg['Number'].head(5).sort_index(ascending=False),
                    y = df_eth_reddit_pep_agg['Person'].head(5).sort_index(ascending=False),
                    name = "Reddit (ETH)",
                    visible=True,
                    orientation = 'h',
                    marker=dict(
                    color='rgb(231, 60, 0)')
                    ),
            ],
                    'layout':go.Layout(title = 'TOP people discussed on Twitter & Reddit', showlegend=True,
                                        updatemenus = list([
                                            dict(active=-1,
                                                 buttons=list([
                                                    dict(label = 'Twitter (BTC & ETH)',
                                                         method = 'update',
                                                         args = [{'visible': [True, False, False, False, False]},
                                                                 {'title': 'TOP 15 people on Twitter (BTC & ETH)'}]),
                                                    dict(label = 'Twitter (BTC)',
                                                         method = 'update',
                                                         args = [{'visible': [False, True, False, False, False]},
                                                                 {'title': 'TOP 15 people on Twitter (BTC)'}]),
                                                    dict(label = 'Twitter (ETH)',
                                                         method = 'update',
                                                         args = [{'visible': [False, False, True, False, False]},
                                                                 {'title': 'TOP 15 people on Twitter (ETH)'}]),
                                                    dict(label = 'Reddit (BTC)',
                                                         method = 'update',
                                                         args = [{'visible': [False, False, False, True, False]},
                                                                 {'title': 'TOP 5 people on Reddit (BTC)'}]),

                                                    dict(label = 'Reddit (ETH)',
                                                         method = 'update',
                                                         args = [{'visible': [False, False, False, False, True]},
                                                                 {'title': 'TOP 5 people on Reddit (ETH)'}]),
                                                    dict(label = 'Reset: show all',
                                                         method = 'update',
                                                         args = [{'visible': [True, True, True, True, True]},
                                                                 {'title': 'TOP people on CCN news'}])

                                                ]),
                                                direction = 'down',
                                                pad = {'r': 10, 't': 10},
                                                showactive = True,
                                                x = 0,
                                                xanchor = 'right',
                                                y = 1.2,
                                                yanchor = 'top'
                                            )
                                        ])
                                        ,
                                        xaxis = dict(title = 'Number of occurrences'),
                                        yaxis = dict(title = ''),
                                        margin=dict(
                                        l=250,
                                        r=20,
                                        t=70,
                                        b=70,
                )
                                )}
        ),
# BTC/ETH values over time
html.H3("You can also look at the recent development in the currency values. If you are interested in a specific time interval, \
you can zoom in by selecting the desired period. If you click at the small house icon, you can reset the axis again."),
                    dcc.Graph(id='scatterplot3',
                    figure = {'data':[
                            go.Scatter(
                            x = btc_values_df.timestamp,
                            y = btc_values_df.close,
                            name = 'BTC',
                            mode = 'markers+lines'
                            ),
                                go.Scatter(
                                x = btc_values_df.timestamp,
                                y = [btc_values_df.close.mean()]*len(btc_values_df.timestamp),
                                name = 'BTC Average',
                                visible = False,
                                line=dict(color='#33CFA5', dash='dash')
                                ),
                            go.Scatter(
                            x = eth_values_df.timestamp,
                            y = eth_values_df.close,
                            name = 'ETH',
                            mode = 'markers+lines'
                            ),
                                go.Scatter(
                                x = eth_values_df.timestamp,
                                y = [eth_values_df.close.mean()]*len(eth_values_df.timestamp),
                                name = 'ETH Average',
                                visible = False,
                                line=dict(color='#33CFA5', dash='dash')
                                )
                            ],
                    'layout':go.Layout(title = 'BTC and ETH values over time', showlegend=True,
                                        updatemenus = list([
                                            dict(active=-1,
                                                 buttons=list([
                                                    dict(label = 'BTC',
                                                         method = 'update',
                                                         args = [{'visible': [True, True, False, False]},
                                                                 {'title': 'BTC values over time',
                                                                 'annotations': [
                                                                 dict(x=btc_values_df.iloc[btc_values_df.close.idxmax()]["timestamp"],
                                                                   y=btc_values_df.close.max(),
                                                                   xref='x', yref='y',
                                                                   text='Max value:<br>'+str(btc_values_df.close.max()),
                                                                   ax=0, ay=-40),
                                                                 dict(x='2017-09-01 00:00:00',
                                                                     y=btc_values_df.close.mean(),
                                                                     xref='x', yref='y',
                                                                     text='Average value in the displayed time period:<br>'+str(round(btc_values_df.close.mean(), 2)),
                                                                     ax=0, ay=-40)
                                                                 ]},
                                                                 ]),
                                                    dict(label = 'ETH',
                                                         method = 'update',
                                                         args = [{'visible': [False, False, True, True]},
                                                                 {'title': 'ETH values over time',
                                                                 'annotations': [
                                                                 dict(x=eth_values_df.iloc[eth_values_df.close.idxmax()]["timestamp"],
                                                                   y=eth_values_df.close.max(),
                                                                   xref='x', yref='y',
                                                                   text='Max value:<br>'+str(eth_values_df.close.max()),
                                                                   ax=0, ay=-40),
                                                                dict(x='2017-09-01 00:00:00',
                                                                     y=eth_values_df.close.mean(),
                                                                     xref='x', yref='y',
                                                                     text='Average value in the displayed time period:<br>'+str(round(eth_values_df.close.mean(), 2)),
                                                                     ax=0, ay=-40)
                                                                 ]}]),
                                                    dict(label = 'Both',
                                                         method = 'update',
                                                         args = [{'visible': [True, False, True, False]},
                                                                 {'title': 'BTC and ETH values over time',
                                                                 'annotations': []}])
                                                ]),
                                            )
                                        ]),
                                        xaxis = {'title':'Time'},
                                        yaxis = {'title':'Value (in USD)'}
                                        )}
                                        ),
# BoW plot
html.H3("Additionally, you can see the most common words that are used in all discussions around Bitcoin and Ethereum on diverse channels. \
You can select the channel and the currency you are interested in from the dropdown menu on the left."),
                    dcc.Graph(id='barplot4',
                    figure = {'data':[
                    go.Bar(
                        x=generate_word_list(text_col= btc_reddit.title).word,
                        y=generate_word_list(text_col= btc_reddit.title).word_count,
                        name = 'BTC words on Reddit',
                        visible=True,
                        marker=dict(color='#f2a900') # set the marker color to gold
                    ),
                    go.Bar(
                        x=generate_word_list(text_col = eth_reddit.title).word,
                        y=generate_word_list(text_col = eth_reddit.title).word_count,
                        name = 'ETH words on Reddit',
                        visible=True,
                        marker=dict(color='#4d4d4e') # set the marker color to silver
                    ),
                    go.Bar(
                        x=generate_word_list(text_col = twitter_btc_df.text).word,
                        y=generate_word_list(text_col = twitter_btc_df.text).word_count,
                        name = 'BTC words on Twitter',
                        visible=False,
                        marker=dict(color='#f2a900') # set the marker color to gold
                    ),
                    go.Bar(
                        x=generate_word_list(text_col = twitter_eth_df.text).word,
                        y=generate_word_list(text_col = twitter_eth_df.text).word_count,
                        name = 'ETH words on Twitter',
                        visible=False,
                        marker=dict(color='#4d4d4e') # set the marker color to silver
                    ),
                    go.Bar(
                        x=generate_word_list(text_col = ccn_df.article).word,
                        y=generate_word_list(text_col = ccn_df.article).word_count,
                        name = 'Top words in Cryptocurrency News',
                        visible=False,
                        marker=dict(color='#f2a900') # set the marker color to gold
                    )
                    ],
                    'layout':go.Layout(title = str(n) +' most common words currently used in Bitcoin/Ethereum discussions', showlegend=True,
                                        updatemenus = list([
                                            dict(active=-1,
                                                 buttons=list([
                                                    dict(label = 'BTC words on Reddit',
                                                         method = 'update',
                                                         args = [{'visible': [True, False, False, False, False]},
                                                                 {'title': str(n) + ' most common words currently used about Bitcoin on Reddit'}]),
                                                    dict(label = 'ETH words on Reddit',
                                                         method = 'update',
                                                         args = [{'visible': [False, True, False, False, False]},
                                                                 {'title': str(n) + ' most common words currently used about Ethereum on Reddit'}]),
                                                    dict(label = 'Both Reddit',
                                                         method = 'update',
                                                         args = [{'visible': [True, True, False, False, False]},
                                                                 {'title': str(n)+ ' most common words currently used about Bitcoin and Ethereum on Reddit'}]),
                                                    dict(label = 'BTC words on Twitter',
                                                         method = 'update',
                                                         args = [{'visible': [False, False, True, False, False]},
                                                                 {'title': str(n) + ' most common words currently used about Bitcoin on Twitter'}]),
                                                    dict(label = 'ETH words on Twitter',
                                                         method = 'update',
                                                         args = [{'visible': [False, False, False, True, False]},
                                                                 {'title': str(n) + ' most common words currently used about Ethereum on Twitter'}]),
                                                    dict(label = 'Both Twitter',
                                                         method = 'update',
                                                         args = [{'visible': [False, False, True, True, False]},
                                                                 {'title': str(n) + ' most common words currently used about Bitcoin and Ethereum on Twitter'}]),
                                                    dict(label = 'Cryptocurrency News',
                                                         method = 'update',
                                                         args = [{'visible': [False, False, False, False, True]},
                                                                 {'title': str(n) + ' most common words currently used about Bitcoin and Ethereum in the News'}])
                                                ]),
                                            )
                                        ])
                                        ,
                                        xaxis = {'title':'Word'},
                                        yaxis = {'title':'Word count'}
                                    )}
                                        ),
# Interactive tables to inspect raw data
                html.Div([
                html.H2('Let\'s go more into detail: Reddit data'),
                dt.DataTable(
                    rows = reddit.to_dict('records'),
                    filterable=True,
                    sortable=True
                )]),
                html.Div([
                html.H2('Twitter data'),
                dt.DataTable(
                rows = twitter.to_dict('records'),
                filterable=True,
                sortable=True
                )]),
                html.Div([
                html.H2('The News'),
                dt.DataTable(
                rows = ccn_df.to_dict('records'),
                filterable=True,
                sortable=True
                )])
            ])

if __name__ == '__main__':
    app.run_server()

Reddit preprocessing

In [45]:
import re
import numpy as np
import pandas as pd
from textblob import TextBlob
from sqlalchemy import create_engine
from sqlalchemy.types import String, Integer, Float, Boolean, DateTime
import requests
import json
In [46]:
engine = create_engine('postgresql://consultant:pgHWR2018@zeno.lehre.hwr-berlin.de:5432/disruptive')
connection = engine.connect()
In [47]:
btc_reddit = pd.read_sql(sql = "select distinct title, created_utc, \"SA_score_grouped\",\
                \"SA_score\" from btc_reddit order by created_utc desc", con = connection, index_col=None)
eth_reddit = pd.read_sql(sql = "select distinct title, created_utc, \"SA_score_grouped\",\
                \"SA_score\" from eth_reddit order by created_utc desc", con = connection, index_col=None)
In [48]:
btc_reddit.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 4 columns):
title               840 non-null object
created_utc         840 non-null datetime64[ns]
SA_score_grouped    840 non-null object
SA_score            840 non-null float64
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 26.3+ KB
In [49]:
eth_reddit.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 4 columns):
title               364 non-null object
created_utc         364 non-null datetime64[ns]
SA_score_grouped    364 non-null object
SA_score            364 non-null float64
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 11.5+ KB
In [50]:
# to later display the data table to filter and sort data for both BTC and ETH
reddit = pd.concat([btc_reddit, eth_reddit], axis=0, join='outer', # to get UNION of rows, instead of intersection
          join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)

reddit.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1204 entries, 0 to 363
Data columns (total 4 columns):
title               1204 non-null object
created_utc         1204 non-null datetime64[ns]
SA_score_grouped    1204 non-null object
SA_score            1204 non-null float64
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 47.0+ KB
In [51]:
reddit.head()
Out[51]:
title created_utc SA_score_grouped SA_score
0 Is Bitcoin Feasible as an Institutional Invest... 2018-06-30 19:50:32 neutral 0.000
1 This is the most BS article I've come across s... 2018-06-30 19:10:44 positive 0.200
2 Bitcoin and Ether Surge 11% as Crypto Market S... 2018-06-30 19:06:04 neutral 0.000
3 Rumors that a huge flood in Sichuan took out a... 2018-06-30 18:40:41 positive 0.400
4 Bitcoin & Beyond: Can Blockchain Bring In 'The... 2018-06-30 18:33:42 positive 0.175
In [52]:
reddit.SA_score = round(reddit.SA_score,2)
reddit.head()
Out[52]:
title created_utc SA_score_grouped SA_score
0 Is Bitcoin Feasible as an Institutional Invest... 2018-06-30 19:50:32 neutral 0.00
1 This is the most BS article I've come across s... 2018-06-30 19:10:44 positive 0.20
2 Bitcoin and Ether Surge 11% as Crypto Market S... 2018-06-30 19:06:04 neutral 0.00
3 Rumors that a huge flood in Sichuan took out a... 2018-06-30 18:40:41 positive 0.40
4 Bitcoin & Beyond: Can Blockchain Bring In 'The... 2018-06-30 18:33:42 positive 0.18
In [53]:
# for later range in plots
minDate = btc_reddit["created_utc"].min()
maxDate = btc_reddit["created_utc"].max()
In [54]:
btc_grouped = btc_reddit[["SA_score", "SA_score_grouped"]].groupby("SA_score_grouped").count()
btc_grouped["sentiment"] = btc_grouped.index
btc_grouped.reset_index(drop=True, inplace=True)
btc_grouped.rename(columns={"SA_score": "nr_of_tweets"}, inplace=True)
btc_grouped
Out[54]:
nr_of_tweets sentiment
0 95 negative
1 499 neutral
2 246 positive
In [55]:
eth_grouped = eth_reddit[["SA_score", "SA_score_grouped"]].groupby("SA_score_grouped").count()
eth_grouped["sentiment"] = eth_grouped.index
eth_grouped.reset_index(drop=True, inplace=True)
eth_grouped.rename(columns={"SA_score": "nr_of_tweets"}, inplace=True)
eth_grouped
Out[55]:
nr_of_tweets sentiment
0 40 negative
1 191 neutral
2 133 positive

BTC and ETH values over time - Preprocessing

In [56]:
url = 'https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=365'
r = requests.get(url) # Decode the JSON data into a dictionary: json_data
json_data = r.json()
btc_values_df = pd.DataFrame(json_data["Data"]) # dictionary of currency values is stored as a list under the key "Data"
btc_values_df["timestamp"] = pd.to_datetime(btc_values_df["time"], unit='s') 
# clean the date format: up to seconds, without miliseconds
btc_values_df.tail()
Out[56]:
close high low open time volumefrom volumeto timestamp
361 6088.39 6286.36 6067.49 6260.40 1529971200 62925.30 3.913967e+08 2018-06-26
362 6141.57 6192.11 6021.69 6092.26 1530057600 62639.82 3.834571e+08 2018-06-27
363 5871.28 6172.23 5844.26 6141.57 1530144000 63585.51 3.859070e+08 2018-06-28
364 6203.80 6286.67 5813.02 5870.81 1530230400 92884.98 5.547990e+08 2018-06-29
365 6327.85 6503.33 6194.51 6203.81 1530316800 58471.09 3.740548e+08 2018-06-30
In [57]:
url = 'https://min-api.cryptocompare.com/data/histoday?fsym=ETH&tsym=USD&limit=365'
r = requests.get(url)
json_data = r.json() # Decode the JSON data into a dictionary: json_data
eth_values_df = pd.DataFrame(json_data["Data"])
eth_values_df["timestamp"] = pd.to_datetime(eth_values_df["time"], unit='s') 
# clean the date format: converts the unix timestamp to pandas date data type
eth_values_df.tail()
Out[57]:
close high low open time volumefrom volumeto timestamp
361 429.58 461.18 429.38 458.81 1529971200 457008.59 2.040352e+08 2018-06-26
362 441.75 446.12 419.84 429.89 1530057600 421578.16 1.823433e+08 2018-06-27
363 420.72 443.51 417.62 441.75 1530144000 346521.20 1.497883e+08 2018-06-28
364 435.25 441.87 405.01 420.70 1530230400 544657.78 2.275647e+08 2018-06-29
365 443.97 462.56 435.16 435.26 1530316800 328395.44 1.477416e+08 2018-06-30

Twitter preprocessing

In [58]:
twitter_df = pd.read_sql(sql = "select distinct text, tweet_created from twitter3 TABLESAMPLE SYSTEM(1) \
    where text ~* '(btc|#eth|ether|bitcoin|ethereum)' order by tweet_created desc", con = connection, index_col=None)
twitter_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 2 columns):
text             2889 non-null object
tweet_created    2889 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 45.2+ KB
In [59]:
twitter_df.head(3)
Out[59]:
text tweet_created
0 RT @FairNinja: Buy and sell with anyone, anywh... 2018-06-30 20:10:30
1 Cryptocurrency Investing for Dummies. Top 20 C... 2018-06-30 20:10:25
2 RT @btc: On average #Bitcoin adds a zero every... 2018-06-30 20:10:25
In [60]:
twitter_df.tail(3)
Out[60]:
text tweet_created
2886 Kokos Token Free #Airdrop:\nGet more than 500$... 2018-06-17 19:43:18
2887 Make money online with the best automated syst... 2018-06-17 19:43:18
2888 Sometimes it’t good to zoom out and get some #... 2018-06-17 19:43:16

Splitting based on keywords so that we have 2 separate datasets: one for BTC and one for ETH

In [61]:
list_of_tweets = twitter_df.text.tolist()

eth_tweets = [tweet for tweet in list_of_tweets if \
              len(re.findall(r"(ethereum|Ethereum|ETH|ETC|Ethereum Classic|EthereumClassic|ether|eth)", tweet)) > 0]

btc_tweets = [tweet for tweet in list_of_tweets if \
              len(re.findall(r"(bitcoin|Bitcoin|BTC|BitCoin|bitCoin|BitcoinClassic|Bitcoin Classic|bitcoinclassic|bitcoinClassic|XBT)", tweet)) > 0]

# Sentiment analysis
def get_sentiment(sentence):
    analysis = TextBlob(sentence)
    return(analysis.sentiment.polarity) # > 0 positive, < 0 negative

btc_twitter_sa = [get_sentiment(sentence) for sentence in btc_tweets]
twitter_btc_df = pd.DataFrame({"text":btc_tweets, "SA_score":btc_twitter_sa})
twitter_btc_df = pd.merge(twitter_btc_df, twitter_df, how='inner', on="text")

eth_twitter_sa = [get_sentiment(sentence) for sentence in eth_tweets]
twitter_eth_df = pd.DataFrame({"text":eth_tweets, "SA_score":eth_twitter_sa})
twitter_eth_df = pd.merge(twitter_eth_df, twitter_df, how="inner", on="text")
In [62]:
twitter_btc_df.head(3)
Out[62]:
SA_score text tweet_created
0 0.780000 RT @FairNinja: Buy and sell with anyone, anywh... 2018-06-30 20:10:30
1 0.333333 Cryptocurrency Investing for Dummies. Top 20 C... 2018-06-30 20:10:25
2 -0.150000 RT @btc: On average #Bitcoin adds a zero every... 2018-06-30 20:10:25
In [63]:
twitter_btc_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3824 entries, 0 to 3823
Data columns (total 3 columns):
SA_score         3824 non-null float64
text             3824 non-null object
tweet_created    3824 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 119.5+ KB
In [64]:
twitter_eth_df.head(3)
Out[64]:
SA_score text tweet_created
0 0.78 RT @FairNinja: Buy and sell with anyone, anywh... 2018-06-30 20:10:30
1 0.00 RT @Abdo63163776: @emporeumorg using hashtags ... 2018-06-30 19:29:33
2 0.00 RT @JackPosobiec: Truly stunning to see the #K... 2018-06-30 18:06:38
In [65]:
twitter_eth_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2634 entries, 0 to 2633
Data columns (total 3 columns):
SA_score         2634 non-null float64
text             2634 non-null object
tweet_created    2634 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 82.3+ KB
In [66]:
# to later display the data table to filter and sort data for both BTC and ETH
twitter = pd.concat([twitter_btc_df, twitter_eth_df], axis=0, join='outer', # to get UNION of rows, instead of intersection
          join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)

Remove duplicates for detailed sentiment plot

In [67]:
twitter_btc_df2 = twitter_btc_df.drop_duplicates(subset = ["text"], keep='first', inplace=False)
twitter_btc_df2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1989 entries, 0 to 3823
Data columns (total 3 columns):
SA_score         1989 non-null float64
text             1989 non-null object
tweet_created    1989 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 62.2+ KB
In [68]:
twitter_eth_df2 = twitter_eth_df.drop_duplicates(subset = ["text"], keep='first', inplace=False)
twitter_eth_df2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 692 entries, 0 to 2633
Data columns (total 3 columns):
SA_score         692 non-null float64
text             692 non-null object
tweet_created    692 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 21.6+ KB

Preprocessing for aggregated pie chart

In [69]:
def get_short_sentiment(sentence):
    '''function to classify sentiment of passed SA score'''
    if sentence > 0.05:
        return 'positive'
    elif sentence <= 0.05 and sentence > -0.005:
        return 'neutral'
    else:
        return 'negative'

short_twitter_btc = [get_short_sentiment(t) for t in twitter_btc_df.SA_score] # twitter_btc
short_twitter_eth = [get_short_sentiment(t) for t in twitter_eth_df.SA_score] # twitter_eth

twitter_btc_grouped = pd.DataFrame({"nr_of_tweets":twitter_btc_df.text, "short":short_twitter_btc}).groupby("short")
twitter_eth_grouped = pd.DataFrame({"nr_of_tweets":twitter_eth_df.text, "short":short_twitter_eth}).groupby("short")

twitter_btc_grouped = twitter_btc_grouped.count()
twitter_eth_grouped = twitter_eth_grouped.count()

twitter_btc_grouped["sentiment"] = twitter_btc_grouped.index
twitter_eth_grouped["sentiment"] = twitter_eth_grouped.index

twitter_btc_grouped.reset_index(drop=True, inplace=True)
twitter_eth_grouped.reset_index(drop=True, inplace=True)
In [70]:
twitter_btc_grouped
Out[70]:
nr_of_tweets sentiment
0 289 negative
1 1920 neutral
2 1615 positive
In [71]:
twitter_eth_grouped
Out[71]:
nr_of_tweets sentiment
0 91 negative
1 1447 neutral
2 1096 positive

News preprocessing

In [72]:
ccn = pd.read_sql(sql = "select distinct article, date from ccn_articles order by date desc", 
                         con = connection, index_col=None)
ccn.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2711 entries, 0 to 2710
Data columns (total 2 columns):
article    2711 non-null object
date       2711 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 42.4+ KB
In [73]:
ccn_sa = [get_sentiment(sentence) for sentence in ccn.article]
ccn_sa_df = pd.DataFrame({"article":ccn.article, "SA_score":ccn_sa})
ccn_df = pd.merge(ccn_sa_df, ccn, how='inner', on="article")
ccn_df.head(1)
Out[73]:
SA_score article date
0 -0.012143 The U.S. Supreme Court has denied Ross Ulbric... 2018-06-28 16:20:00
In [74]:
short_ccn = [get_short_sentiment(t) for t in ccn_df.SA_score] # twitter_btc
ccn_grouped = pd.DataFrame({"nr_of_articles":ccn_df.article, "short":short_ccn}).groupby("short").count()
ccn_grouped["sentiment"] = ccn_grouped.index
ccn_grouped.reset_index(drop=True, inplace=True)
ccn_grouped
Out[74]:
nr_of_articles sentiment
0 202 negative
1 639 neutral
2 1870 positive

Simple BoW model

In [75]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize

n = 10
def generate_word_list(text_col, nr_words = n):
    tokens = word_tokenize(text_col.to_string()) # tokenize
    lower_tokens = [t.lower() for t in tokens] # Convert the tokens into lowercase: lower_tokens
    alpha_only = [t for t in lower_tokens if t.isalpha()] # Retain alphabetic words: alpha_only
    stopwords = nltk.corpus.stopwords.words('english') # Remove all stop words: no_stops
    newStopWords = ["rt", "bitcoin", "crypto", "cryptocurrency", "blockchain", "btc", "bitcoi", "bitcoins",
                   "price", "ethereum", "eth", "classic", "exchange", "market", "cryptocurrencie", 
                   "cryptocurrencies", "http", "htttp", "hour", "list", "u"]
    stopwords.extend(newStopWords)
    no_stops = [t for t in alpha_only if t not in stopwords]
    wordnet_lemmatizer = WordNetLemmatizer() # Instantiate the WordNetLemmatizer
    lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops if len(t)>1] # Lemmatize all tokens into a new list
    bow = Counter(lemmatized) # Create the bag-of-words: bow
    word = []
    word_count = []
    for i in range(nr_words):
        word.append(bow.most_common(nr_words)[i][0])
        word_count.append(bow.most_common(nr_words)[i][1])
    words_and_counts_df = pd.DataFrame({"word":word, "word_count":word_count})
    return(words_and_counts_df) # return the n most common tokens
In [76]:
generate_word_list(text_col = btc_reddit.title)
Out[76]:
word word_count
0 cash 30
1 new 27
2 could 23
3 future 22
4 watch 22
5 say 22
6 hit 17
7 bank 15
8 analysis 15
9 ceo 14
In [77]:
generate_word_list(text_col = eth_reddit.title)
Out[77]:
word word_count
0 tether 66
1 new 14
2 token 14
3 eos 13
4 tron 8
5 add 8
6 analysis 8
7 ripple 7
8 first 7
9 million 7
In [78]:
generate_word_list(text_col = twitter_btc_df.text)
Out[78]:
word word_count
0 new 598
1 listing 535
2 kucoincom 534
3 launching 534
4 giveaway 240
5 gamdomofficial 228
6 wabnetwork 171
7 blockcha 170
8 wab 170
9 facelift 169
In [79]:
generate_word_list(text_col = twitter_eth_df.text)
Out[79]:
word word_count
0 new 552
1 check 549
2 kucoincom 534
3 launching 533
4 listing 532
5 vi 530
6 short 529
7 explainer 529
8 token 238
9 airdrop 189

Inline Visualizations within Jupyter Notebook

In [4]:
import plotly.plotly as py
import plotly.graph_objs as go 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
init_notebook_mode(connected=True) # connects javaScript to this notebook, since it connects plotly and pandas 
# to interactive javaScript library for offline use
cf.go_offline()
%matplotlib inline
In [81]:
figure = {'data':[
    go.Scatter(
    x = btc_reddit.created_utc,
    y = btc_reddit.SA_score,
    name = "BTC Sentiment on Reddit",
    visible=True,
    marker=dict(color='#f2a900'),
    mode = 'markers+lines'
    ),
    go.Scatter(
    x = eth_reddit.created_utc,
    y = eth_reddit.SA_score,
    name = "ETH Sentiment on Reddit",
    visible=True,
    marker=dict(color='#4d4d4e'),
    mode = 'markers+lines'
    ),
    go.Scatter(
    x = twitter_btc_df2.tweet_created,
    y = twitter_btc_df2.SA_score,
    name = "BTC Sentiment on Twitter",
    visible=False,
    marker=dict(color='#f2a900'),
    mode = 'markers+lines'
    ),
    go.Scatter(
    x = twitter_eth_df2.tweet_created,
    y = twitter_eth_df2.SA_score,
    name = "ETH Sentiment on Twitter",
    visible=False,
    marker=dict(color='#4d4d4e'),
    mode = 'markers+lines'
    ),
    go.Scatter(
    x = ccn_df.date[np.logical_and(ccn_df.date >= minDate, ccn_df.date <= maxDate)],
    y = ccn_df.SA_score[np.logical_and(ccn_df.date >= minDate, ccn_df.date <= maxDate)],
    name = "BTC and ETH Sentiment in the News",
    visible=False,
    marker=dict(color='#4d4d4e'),
    mode = 'markers+lines'
    )
],
    'layout':go.Layout(title = 'BTC and ETH sentiment over time', showlegend=True,
                        updatemenus = list([
                            dict(active=-1,
                                 buttons=list([
                                    dict(label = 'BTC Sentiment on Reddit',
                                         method = 'update',
                                         args = [{'visible': [True, False, False, False, False]},
                                                 {'title': 'BTC sentiment over time on Reddit'}]),
                                    dict(label = 'ETH Sentiment on Reddit',
                                         method = 'update',
                                         args = [{'visible': [False, True, False, False, False]},
                                                 {'title': 'ETH sentiment over time on Reddit'}]),
                                    dict(label = 'Both: Sentiment on Reddit',
                                         method = 'update',
                                         args = [{'visible': [True, True, False, False, False]},
                                                 {'title': 'BTC and ETH sentiment over time on Reddit'}]),
                                    dict(label = 'BTC Sentiment on Twitter',
                                         method = 'update',
                                         args = [{'visible': [False, False, True, False, False]},
                                                 {'title': 'BTC sentiment over time on Twitter'}]),
                                    dict(label = 'ETH Sentiment on Twitter',
                                         method = 'update',
                                         args = [{'visible': [False, False, False, True, False]},
                                                 {'title': 'ETH sentiment over time on Twitter'}]),
                                    dict(label = 'Both: Sentiment on Twitter',
                                         method = 'update',
                                         args = [{'visible': [False, False, True, True, False]},
                                                 {'title': 'BTC and ETH sentiment over time on Twitter'}]),
                                    dict(label = 'BTC & ETH Sentiment in the News',
                                         method = 'update',
                                         args = [{'visible': [False, False, False, False, True]},
                                                 {'title': 'BTC and ETH Sentiment in the News'}])
                                ]),
                            )
                        ])
                        ,
                        xaxis = dict(title = 'Time'), #, range = [minDate, maxDate]),
                        yaxis = dict(title = 'Sentiment')
                )}
iplot(go.Figure(figure))
In [82]:
figure = {'data':[
    go.Pie(
        labels=btc_grouped.sentiment,
        values=btc_grouped.nr_of_tweets,
        name = 'BTC Sentiment on Reddit',
        visible=True,
        marker=dict(colors=['#fc586e', '#fffaaa', '#87d686']) # set the colors to red, yellow and green for pie chart
    ),
    go.Pie(
        labels=eth_grouped.sentiment,
        values=eth_grouped.nr_of_tweets,
        name = 'ETH Sentiment on Reddit',
        visible=False,
        marker=dict(colors=['#fc586e', '#fffaaa', '#87d686'])
    ),
    go.Pie(
        labels=twitter_btc_grouped.sentiment,
        values=twitter_btc_grouped.nr_of_tweets,
        name = 'BTC Sentiment on Twitter',
        visible=False,
        marker=dict(colors=['#fc586e', '#fffaaa', '#87d686'])
    ),
    go.Pie(
        labels=twitter_eth_grouped.sentiment,
        values=twitter_eth_grouped.nr_of_tweets,
        name = 'ETH Sentiment on Twitter',
        visible=False,
        marker=dict(colors=['#fc586e', '#fffaaa', '#87d686'])
    ),
    go.Pie(
        labels=ccn_grouped.sentiment,
        values=ccn_grouped.nr_of_articles,
        name = 'BTC and ETH Sentiment in the News',
        visible=False,
        marker=dict(colors=['#fc586e', '#fffaaa', '#87d686'])
    )
    ],
    'layout':go.Layout(title = 'BTC sentiment on Reddit', showlegend=True,
                        updatemenus = list([
                            dict(active=-1,
                                 buttons=list([
                                    dict(label = 'BTC sentiment on Reddit',
                                         method = 'update',
                                         args = [{'visible': [True, False, False, False, False]},
                                                 {'title': 'BTC sentiment on Reddit'}]),
                                    dict(label = 'ETH sentiment on Reddit',
                                         method = 'update',
                                         args = [{'visible': [False, True, False, False, False]},
                                                 {'title': 'ETH sentiment on Reddit'}]),
                                     dict(label = 'BTC sentiment on Twitter',
                                          method = 'update',
                                          args = [{'visible': [False, False, True, False, False]},
                                                  {'title': 'BTC sentiment on Twitter'}]),
                                     dict(label = 'ETH sentiment on Twitter',
                                          method = 'update',
                                          args = [{'visible': [False, False, False, True, False]},
                                                  {'title': 'ETH sentiment on Twitter'}]),
                                    dict(label = 'BTC & ETH Sentiment in the News',
                                          method = 'update',
                                          args = [{'visible': [False, False, False, False, True]},
                                                  {'title': 'BTC and ETH Sentiment in the News'}])
                                ]),
                            )
                        ])
                    )}

iplot(go.Figure(figure))
In [83]:
figure = {'data':[
    go.Scatter(
    x = btc_values_df.timestamp,
    y = btc_values_df.close,
    name = 'BTC',
    mode = 'markers+lines'
    ),
        go.Scatter(
        x = btc_values_df.timestamp,
        y = [btc_values_df.close.mean()]*len(btc_values_df.timestamp),
        name = 'BTC Average',
        visible = False,
        line=dict(color='#33CFA5', dash='dash')
        ),
    go.Scatter(
    x = eth_values_df.timestamp,
    y = eth_values_df.close,
    name = 'ETH',
    mode = 'markers+lines'
    ),
        go.Scatter(
        x = eth_values_df.timestamp,
        y = [eth_values_df.close.mean()]*len(eth_values_df.timestamp),
        name = 'ETH Average',
        visible = False,
        line=dict(color='#33CFA5', dash='dash')
        )
    ],
'layout':go.Layout(title = 'BTC and ETH values over time', showlegend=True,
                updatemenus = list([
                    dict(active=-1,
                         buttons=list([
                            dict(label = 'BTC',
                                 method = 'update',
                                 args = [{'visible': [True, True, False, False]},
                                         {'title': 'BTC values over time',
                                         'annotations': [
                                         dict(x=btc_values_df.iloc[btc_values_df.close.idxmax()]["timestamp"],
                                           y=btc_values_df.close.max(),
                                           xref='x', yref='y',
                                           text='Max value:<br>'+str(btc_values_df.close.max()),
                                           ax=0, ay=-40),
                                         dict(x='2017-09-01 00:00:00',
                                             y=btc_values_df.close.mean(),
                                             xref='x', yref='y',
                                             text='Average value in the displayed time period:<br>'+str(round(btc_values_df.close.mean(), 2)),
                                             ax=0, ay=-40)
                                         ]},
                                         ]),
                            dict(label = 'ETH',
                                 method = 'update',
                                 args = [{'visible': [False, False, True, True]},
                                         {'title': 'ETH values over time',
                                         'annotations': [
                                         dict(x=eth_values_df.iloc[eth_values_df.close.idxmax()]["timestamp"],
                                           y=eth_values_df.close.max(),
                                           xref='x', yref='y',
                                           text='Max value:<br>'+str(eth_values_df.close.max()),
                                           ax=0, ay=-40),
                                        dict(x='2017-09-01 00:00:00',
                                             y=eth_values_df.close.mean(),
                                             xref='x', yref='y',
                                             text='Average value in the displayed time period:<br>'+str(round(eth_values_df.close.mean(), 2)),
                                             ax=0, ay=-40)
                                         ]}]),
                            dict(label = 'Both',
                                 method = 'update',
                                 args = [{'visible': [True, False, True, False]},
                                         {'title': 'BTC and ETH values over time',
                                         'annotations': []}])
                        ]),
                    )
                ]),
                xaxis = {'title':'Time'},
                yaxis = {'title':'Value (in USD)'}
                )}

iplot(go.Figure(figure))
In [84]:
figure = {'data':[
    go.Bar(
        x=generate_word_list(text_col= btc_reddit.title).word,
        y=generate_word_list(text_col= btc_reddit.title).word_count,
        name = 'BTC words on Reddit',
        visible=True,
        marker=dict(color='#f2a900') # set the marker color to gold
    ),
    go.Bar(
        x=generate_word_list(text_col = eth_reddit.title).word,
        y=generate_word_list(text_col = eth_reddit.title).word_count,
        name = 'ETH words on Reddit',
        visible=True,
        marker=dict(color='#4d4d4e') # set the marker color to silver
    ),
    go.Bar(
        x=generate_word_list(text_col = twitter_btc_df.text).word,
        y=generate_word_list(text_col = twitter_btc_df.text).word_count,
        name = 'BTC words on Twitter',
        visible=False,
        marker=dict(color='#f2a900') # set the marker color to gold
    ),
    go.Bar(
        x=generate_word_list(text_col = twitter_eth_df.text).word,
        y=generate_word_list(text_col = twitter_eth_df.text).word_count,
        name = 'ETH words on Twitter',
        visible=False,
        marker=dict(color='#4d4d4e') # set the marker color to silver
    ),
    go.Bar(
        x=generate_word_list(text_col = ccn_df.article).word,
        y=generate_word_list(text_col = ccn_df.article).word_count,
        name = 'Top words in Cryptocurrency News',
        visible=False,
        marker=dict(color='#f2a900') # set the marker color to gold
    )
    ],
    'layout':go.Layout(title = str(n) +' most common words currently used in Bitcoin/Ethereum discussions', showlegend=True,
                        updatemenus = list([
                            dict(active=-1,
                                 buttons=list([
                                    dict(label = 'BTC words on Reddit',
                                         method = 'update',
                                         args = [{'visible': [True, False, False, False, False]},
                                                 {'title': str(n) + ' most common words currently used about Bitcoin on Reddit'}]),
                                    dict(label = 'ETH words on Reddit',
                                         method = 'update',
                                         args = [{'visible': [False, True, False, False, False]},
                                                 {'title': str(n) + ' most common words currently used about Ethereum on Reddit'}]),
                                    dict(label = 'Both Reddit',
                                         method = 'update',
                                         args = [{'visible': [True, True, False, False, False]},
                                                 {'title': str(n)+ ' most common words currently used about Bitcoin and Ethereum on Reddit'}]),
                                    dict(label = 'BTC words on Twitter',
                                         method = 'update',
                                         args = [{'visible': [False, False, True, False, False]},
                                                 {'title': str(n) + ' most common words currently used about Bitcoin on Twitter'}]),
                                    dict(label = 'ETH words on Twitter',
                                         method = 'update',
                                         args = [{'visible': [False, False, False, True, False]},
                                                 {'title': str(n) + ' most common words currently used about Ethereum on Twitter'}]),
                                    dict(label = 'Both Twitter',
                                         method = 'update',
                                         args = [{'visible': [False, False, True, True, False]},
                                                 {'title': str(n) + ' most common words currently used about Bitcoin and Ethereum on Twitter'}]),
                                    dict(label = 'Cryptocurrency News',
                                         method = 'update',
                                         args = [{'visible': [False, False, False, False, True]},
                                                 {'title': str(n) + ' most common words currently used about Bitcoin and Ethereum in the News'}])
                                ]),
                            )
                        ])
                        ,
                        xaxis = {'title':'Word'},
                        yaxis = {'title':'Word count'}
                    )}

iplot(go.Figure(figure))

Aggregate Sentiment by day

Reddit

In [85]:
btc_reddit.head(2)
Out[85]:
title created_utc SA_score_grouped SA_score
0 Is Bitcoin Feasible as an Institutional Invest... 2018-06-30 19:50:32 neutral 0.0
1 This is the most BS article I've come across s... 2018-06-30 19:10:44 positive 0.2
In [86]:
eth_reddit.head(2)
Out[86]:
title created_utc SA_score_grouped SA_score
0 Trying to transfer Ethereum help. 2018-06-30 18:36:16 neutral 0.0
1 Cryptocurrency fund bullish on Ethereum and EO... 2018-06-30 17:11:47 neutral 0.0
In [87]:
# Reddit
ts_btc_reddit = btc_reddit.set_index("created_utc", inplace=False)
ts_btc_reddit = ts_btc_reddit.SA_score.resample('D').mean()
ts_eth_reddit = eth_reddit.set_index("created_utc", inplace=False)
ts_eth_reddit = ts_eth_reddit.SA_score.resample('D').mean()
standardized_reddit_scores = pd.DataFrame({'BTC':ts_btc_reddit,'ETH':ts_eth_reddit})
# Since the server might be down on certain days, we need to ensure that our time series has no discontinuities: interpolate() fills gaps of any size with a straight line
standardized_reddit_scores['BTC'].interpolate(method='linear', inplace=True)
standardized_reddit_scores['ETH'].interpolate(method='linear', inplace=True)
standardized_reddit_scores
Out[87]:
BTC ETH
created_utc
2018-06-14 0.050486 0.012603
2018-06-15 0.001904 -0.023598
2018-06-16 0.091964 0.129829
2018-06-17 0.052083 0.112980
2018-06-18 0.046581 0.054387
2018-06-19 0.070481 0.105602
2018-06-20 0.063813 0.049278
2018-06-21 0.031335 0.112072
2018-06-22 0.024002 0.067093
2018-06-23 0.061035 0.155357
2018-06-24 0.096405 0.086824
2018-06-25 0.061972 0.134950
2018-06-26 0.097100 0.125118
2018-06-27 0.070191 0.086448
2018-06-28 0.098293 0.111197
2018-06-29 0.071292 0.058147
2018-06-30 0.007854 0.109740

Twitter

In [88]:
twitter_btc_df.head(2)
Out[88]:
SA_score text tweet_created
0 0.780000 RT @FairNinja: Buy and sell with anyone, anywh... 2018-06-30 20:10:30
1 0.333333 Cryptocurrency Investing for Dummies. Top 20 C... 2018-06-30 20:10:25
In [89]:
twitter_eth_df.head(2)
Out[89]:
SA_score text tweet_created
0 0.78 RT @FairNinja: Buy and sell with anyone, anywh... 2018-06-30 20:10:30
1 0.00 RT @Abdo63163776: @emporeumorg using hashtags ... 2018-06-30 19:29:33
In [90]:
ts_twitter_btc_df = twitter_btc_df.set_index("tweet_created", inplace=False)
ts_twitter_btc_df = ts_twitter_btc_df.SA_score.resample('D').mean()
ts_twitter_eth_df = twitter_eth_df.set_index("tweet_created", inplace=False)
ts_twitter_eth_df = ts_twitter_eth_df.SA_score.resample('D').mean()
standardized_twitter_scores = pd.DataFrame({'BTC':ts_twitter_btc_df, 'ETH':ts_twitter_eth_df})
# Since the server might be down on certain days, we need to ensure that time series has no discontinuities: interpolate() fills gaps of any size with a straight line
standardized_twitter_scores['BTC'].interpolate(method='linear', inplace=True)
standardized_twitter_scores['ETH'].interpolate(method='linear', inplace=True)
standardized_twitter_scores
Out[90]:
BTC ETH
tweet_created
2018-06-17 0.156084 0.041667
2018-06-18 0.074546 0.057325
2018-06-19 0.100853 0.055352
2018-06-20 0.057486 0.108037
2018-06-21 0.073244 0.087592
2018-06-22 0.089223 0.105542
2018-06-23 0.135831 0.136281
2018-06-24 0.097578 0.258142
2018-06-25 0.286458 0.433333
2018-06-26 0.064118 0.059400
2018-06-27 0.064865 0.062356
2018-06-28 0.022337 -0.030357
2018-06-29 0.092447 0.083077
2018-06-30 0.095176 0.133764

News

In [91]:
ccn_df = ccn_df[ccn_df.date >= pd.to_datetime("2018-06-10")]
ccn_df.head(1)
Out[91]:
SA_score article date
0 -0.012143 The U.S. Supreme Court has denied Ross Ulbric... 2018-06-28 16:20:00
In [92]:
ccn_df.tail(1)
Out[92]:
SA_score article date
277 0.049577 The CEO of Lazard, Kenneth M. Jacobs, said th... 2018-06-10
In [93]:
ts_ccn = ccn_df.set_index("date", inplace=False)
ts_ccn = ts_ccn.SA_score.resample('D').mean()
standardized_ccn_scores = pd.DataFrame({'CCN':ts_ccn})
standardized_ccn_scores
Out[93]:
CCN
date
2018-06-10 0.063023
2018-06-11 0.063184
2018-06-12 0.066494
2018-06-13 0.069092
2018-06-14 0.118126
2018-06-15 0.029410
2018-06-16 0.093344
2018-06-17 0.101403
2018-06-18 0.057105
2018-06-19 0.067641
2018-06-20 0.075863
2018-06-21 0.085445
2018-06-22 0.079275
2018-06-23 0.089087
2018-06-24 0.074541
2018-06-25 0.054029
2018-06-26 0.077400
2018-06-27 0.085457
2018-06-28 0.072175

Plot it with values

In [94]:
len(standardized_reddit_scores)
Out[94]:
17
In [95]:
url = 'https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=20' # + str(len(standardized_reddit_scores))
# we want only last 5 days
# &toTs=1522224000
r = requests.get(url) # Decode the JSON data into a dictionary: json_data
json_data = r.json()
btc_mini = pd.DataFrame(json_data["Data"]) # dictionary of currency values is stored as a list under the key "Data"
btc_mini["timestamp"] = pd.to_datetime(btc_mini["time"], unit='s') 
btc_mini
Out[95]:
close high low open time volumefrom volumeto timestamp
0 6773.72 7514.04 6661.98 7513.76 1528588800 127797.43 8.955144e+08 2018-06-10
1 6887.37 6913.85 6652.89 6773.72 1528675200 71169.64 4.827881e+08 2018-06-11
2 6556.94 6888.33 6462.61 6887.43 1528761600 75997.29 5.093725e+08 2018-06-12
3 6310.43 6623.00 6140.63 6556.94 1528848000 124304.17 7.970873e+08 2018-06-13
4 6643.26 6719.44 6286.23 6310.43 1528934400 95038.67 6.202964e+08 2018-06-14
5 6396.71 6667.23 6380.69 6643.63 1529020800 57731.42 3.792286e+08 2018-06-15
6 6503.10 6569.15 6345.99 6396.71 1529107200 35851.38 2.328028e+08 2018-06-16
7 6457.78 6585.77 6446.38 6503.10 1529193600 34359.95 2.245755e+08 2018-06-17
8 6714.82 6802.03 6401.41 6457.78 1529280000 65285.57 4.302417e+08 2018-06-18
9 6741.28 6839.60 6672.20 6714.56 1529366400 54949.84 3.707971e+08 2018-06-19
10 6761.27 6817.90 6569.96 6741.28 1529452800 59674.47 4.005302e+08 2018-06-20
11 6720.64 6790.08 6687.66 6761.27 1529539200 43765.89 2.956907e+08 2018-06-21
12 6051.47 6731.51 5941.32 6720.64 1529625600 137346.53 8.628266e+08 2018-06-22
13 6166.54 6257.88 6033.20 6051.47 1529712000 57011.61 3.506974e+08 2018-06-23
14 6157.78 6254.02 5782.13 6166.36 1529798400 110833.06 6.674791e+08 2018-06-24
15 6260.35 6344.45 6095.25 6157.78 1529884800 79086.40 4.928342e+08 2018-06-25
16 6088.39 6286.36 6067.49 6260.40 1529971200 62925.30 3.913967e+08 2018-06-26
17 6141.57 6192.11 6021.69 6092.26 1530057600 62639.82 3.834571e+08 2018-06-27
18 5871.28 6172.23 5844.26 6141.57 1530144000 63585.51 3.859070e+08 2018-06-28
19 6203.80 6286.67 5813.02 5870.81 1530230400 92884.98 5.547990e+08 2018-06-29
20 6329.47 6503.33 6194.51 6203.81 1530316800 58471.09 3.740548e+08 2018-06-30
In [96]:
url = 'https://min-api.cryptocompare.com/data/histoday?fsym=ETH&tsym=USD&limit=20' # we want only last 5 days
# &toTs=1522224000
r = requests.get(url) # Decode the JSON data into a dictionary: json_data
json_data = r.json()
eth_mini = pd.DataFrame(json_data["Data"]) # dictionary of currency values is stored as a list under the key "Data"
eth_mini["timestamp"] = pd.to_datetime(eth_mini["time"], unit='s') 
eth_mini
Out[96]:
close high low open time volumefrom volumeto timestamp
0 524.74 593.35 501.68 593.35 1528588800 636718.45 3.453131e+08 2018-06-10
1 531.15 535.79 509.82 524.73 1528675200 505308.31 2.646738e+08 2018-06-11
2 494.53 539.34 484.70 531.15 1528761600 546980.39 2.799563e+08 2018-06-12
3 476.30 501.67 450.05 494.53 1528848000 752164.24 3.571677e+08 2018-06-13
4 519.83 526.64 460.01 476.30 1528934400 885354.26 4.401127e+08 2018-06-14
5 487.51 521.18 483.02 519.83 1529020800 510688.96 2.553048e+08 2018-06-15
6 497.22 503.28 483.99 487.51 1529107200 241642.76 1.196115e+08 2018-06-16
7 496.74 507.49 494.01 497.22 1529193600 211122.80 1.057374e+08 2018-06-17
8 517.63 524.65 487.72 496.84 1529280000 340090.40 1.724423e+08 2018-06-18
9 538.45 547.94 515.02 517.63 1529366400 371838.21 1.973076e+08 2018-06-19
10 536.16 541.80 517.15 538.45 1529452800 334563.47 1.768976e+08 2018-06-20
11 525.77 544.68 521.92 536.16 1529539200 253943.67 1.355222e+08 2018-06-21
12 462.16 525.84 450.04 525.74 1529625600 756613.52 3.647798e+08 2018-06-22
13 474.18 481.06 456.24 462.16 1529712000 293710.20 1.395536e+08 2018-06-23
14 455.25 476.19 421.01 474.18 1529798400 707602.77 3.155763e+08 2018-06-24
15 458.82 473.27 445.53 455.25 1529884800 462450.74 2.122468e+08 2018-06-25
16 429.58 461.18 429.38 458.81 1529971200 457008.59 2.040352e+08 2018-06-26
17 441.75 446.12 419.84 429.89 1530057600 421578.16 1.823433e+08 2018-06-27
18 420.72 443.51 417.62 441.75 1530144000 346521.20 1.497883e+08 2018-06-28
19 435.25 441.87 405.01 420.70 1530230400 544657.78 2.275647e+08 2018-06-29
20 443.85 462.56 435.16 435.26 1530316800 328395.44 1.477416e+08 2018-06-30
In [97]:
btc_mini.set_index("timestamp", inplace=True)
btc_mini.head(2)
Out[97]:
close high low open time volumefrom volumeto
timestamp
2018-06-10 6773.72 7514.04 6661.98 7513.76 1528588800 127797.43 8.955144e+08
2018-06-11 6887.37 6913.85 6652.89 6773.72 1528675200 71169.64 4.827881e+08
In [98]:
eth_mini.set_index("timestamp", inplace=True)
eth_mini.head(2)
Out[98]:
close high low open time volumefrom volumeto
timestamp
2018-06-10 524.74 593.35 501.68 593.35 1528588800 636718.45 3.453131e+08
2018-06-11 531.15 535.79 509.82 524.73 1528675200 505308.31 2.646738e+08
In [99]:
# Now we scale the "Mini" BTC/ETH values so that we can plot them together with sentiment on the same axis.
# we scale values to be between -1 and 1, i.e. on the same scale as the sentiment values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1))
btc_scaled = pd.DataFrame(scaler.fit_transform(btc_mini), columns=btc_mini.columns)
btc_scaled
Out[99]:
close high low open time volumefrom volumeto
0 0.776299 1.000000 0.943282 1.000000 -1.0 0.814556 1.000000
1 1.000000 0.105402 0.923205 0.099133 -0.9 -0.285156 -0.230295
2 0.349605 0.067364 0.502943 0.237554 -0.8 -0.191403 -0.151050
3 -0.135608 -0.328116 -0.208199 -0.164759 -0.7 0.746717 0.706599
4 0.519511 -0.184370 0.113381 -0.464841 -0.6 0.178381 0.179603
5 0.034219 -0.262191 0.322010 -0.059229 -0.5 -0.546126 -0.538995
6 0.243630 -0.408381 0.245370 -0.359810 -0.4 -0.971036 -0.975475
7 0.154425 -0.383609 0.467097 -0.230299 -0.3 -1.000000 -1.000000
8 0.660365 -0.061268 0.367774 -0.285468 -0.2 -0.399424 -0.386930
9 0.712447 -0.005269 0.965854 0.027116 -0.1 -0.600144 -0.564129
10 0.751794 -0.037613 0.740042 0.059643 0.0 -0.508392 -0.475497
11 0.671820 -0.079080 1.000000 0.083977 0.1 -0.817337 -0.788013
12 -0.645327 -0.166380 -0.648405 0.034517 0.2 1.000000 0.902561
13 -0.418831 -0.872337 -0.445474 -0.780079 0.3 -0.560105 -0.624044
14 -0.436074 -0.878090 -1.000000 -0.640220 0.4 0.485108 0.320250
15 -0.234182 -0.743302 -0.308427 -0.650665 0.5 -0.131412 -0.200349
16 -0.572656 -0.829886 -0.369739 -0.525743 0.6 -0.445261 -0.502723
17 -0.467980 -0.970368 -0.470895 -0.730424 0.7 -0.450805 -0.526390
18 -1.000000 -1.000000 -0.862776 -0.670398 0.8 -0.432439 -0.519087
19 -0.345491 -0.829424 -0.931775 -1.000000 0.9 0.136556 -0.015638
20 -0.098131 -0.506488 -0.089196 -0.594632 1.0 -0.531762 -0.554417
In [100]:
eth_scaled = pd.DataFrame(scaler.fit_transform(eth_mini), columns=eth_mini.columns)
eth_scaled
Out[100]:
close high low open time volumefrom volumeto
0 0.767094 1.000000 0.653751 1.000000 -1.0 0.262462 0.432974
1 0.875987 0.240032 0.793003 0.205097 -0.9 -0.127346 -0.049353
2 0.253886 0.286903 0.363271 0.279467 -0.8 -0.003732 0.042056
3 -0.055806 -0.210457 -0.229493 -0.144744 -0.7 0.604913 0.503880
4 0.683683 0.119224 -0.059105 -0.355922 -0.6 1.000000 1.000000
5 0.134630 0.047135 0.334531 0.148335 -0.5 -0.111385 -0.105392
6 0.299584 -0.189200 0.351125 -0.226064 -0.4 -0.909467 -0.917015
7 0.291430 -0.133615 0.522539 -0.113582 -0.3 -1.000000 -1.000000
8 0.646309 0.092950 0.414935 -0.117984 -0.2 -0.617438 -0.601018
9 1.000000 0.400449 0.881960 0.122850 -0.1 -0.523263 -0.452291
10 0.961097 0.319382 0.918399 0.364031 0.0 -0.633833 -0.574370
11 0.784592 0.357407 1.000000 0.337504 0.1 -0.872979 -0.821848
12 -0.296016 0.108661 -0.229664 0.216797 0.2 0.618111 0.549411
13 -0.091820 -0.482572 -0.123599 -0.519722 0.3 -0.755018 -0.797735
14 -0.413404 -0.546871 -0.726285 -0.380481 0.4 0.472729 0.255110
15 -0.352756 -0.585424 -0.306817 -0.599768 0.5 -0.254476 -0.362935
16 -0.849486 -0.745049 -0.583098 -0.558529 0.6 -0.270619 -0.412051
17 -0.642742 -0.943887 -0.746301 -0.893542 0.7 -0.375718 -0.541797
18 -1.000000 -0.978347 -0.784279 -0.756154 0.8 -0.598362 -0.736518
19 -0.753164 -1.000000 -1.000000 -1.000000 0.9 -0.010622 -0.271314
20 -0.607067 -0.726829 -0.484219 -0.831335 1.0 -0.652129 -0.748761
In [101]:
figure = {'data':[
    go.Scatter(
    x = btc_mini.index,
    y = btc_scaled.close,
    name = "BTC in USD (scaled)",
    visible=True,
    marker=dict(color='#f2a900'),
    mode = 'markers+lines'
    ),
    go.Scatter(
    x = eth_mini.index,
    y = eth_scaled.close,
    name = "ETH in USD (scaled)",
    visible=True,
    marker=dict(color='#4d4d4e'),
    mode = 'markers+lines'
    ),
    go.Scatter(
    x = standardized_reddit_scores.index,
    y = standardized_reddit_scores.BTC,
    line = dict(color = '#f2a900', dash = 'dot'),
    name = "BTC Sentiment on Reddit",
    visible=True,
    #marker=dict(color='green'),
    mode = 'markers+lines'
    ),
    go.Scatter(
    x = standardized_reddit_scores.index,
    y = standardized_reddit_scores.ETH,
        line = dict(color = '#4d4d4e', dash = 'dot'),
    name = "ETH Sentiment on Reddit",
    visible=True,
    #marker=dict(color='blue'),
    mode = 'markers+lines'
    ),
    go.Scatter(
    x = standardized_ccn_scores.index,
    y = standardized_ccn_scores.CCN,
    line = dict(color = 'green', dash = 'dash'),
    name = "BTC and ETH Sentiment in the News",
    visible=True,
    mode = 'markers+lines'
    ),
    go.Scatter(
    x = standardized_twitter_scores.index,
    y = standardized_twitter_scores.BTC,
    line = dict(color = 'blue', dash = 'solid'),
    name = "BTC Sentiment on Twitter",
    visible=True,
    #marker=dict(color='green'),
    mode = 'markers+lines'
    ),
    go.Scatter(
    x = standardized_twitter_scores.index,
    y = standardized_twitter_scores.ETH,
        line = dict(color = 'purple', dash = 'solid'),
    name = "ETH Sentiment on Twitter",
    visible=True,
    #marker=dict(color='blue'),
    mode = 'markers+lines'
    )],
    'layout':go.Layout(title = 'BTC and ETH values & sentiment', showlegend=True,
                        updatemenus = list([
                            dict(active=-1, buttons=list([
                                    dict(label = 'BTC and ETH Values over time',
                                         method = 'update',
                                         args = [{'visible': [True, True, False, False, False, False, False]},
                                                 {'title': 'BTC and ETH values'}]),
                                    dict(label = 'BTC and ETH Sentiment on Reddit',
                                         method = 'update',
                                         args = [{'visible': [False, False, True, True, False, False, False]},
                                                 {'title': 'BTC and ETH sentiment on Reddit'}]),
                                    dict(label = 'News',
                                         method = 'update',
                                         args = [{'visible': [False, False, False, False, True, False, False]},
                                                 {'title': 'BTC and ETH sentiment in the News'}]),
                                    dict(label = 'Twitter BTC & ETH',
                                         method = 'update',
                                         args = [{'visible': [False, False, False, False, False, True, True]},
                                                 {'title': 'BTC and ETH sentiment on Twitter'}]),
                                    dict(label = 'Reset: show all',
                                         method = 'update',
                                         args = [{'visible': [True, True, True, True, True, True, True]},
                                                 {'title': 'BTC and ETH values & sentiment in the News and Social Media'}])
                                ])
                            )
                        ])
                        ,
                        xaxis = dict(title = 'Time', range = [minDate, maxDate]),
                        yaxis = dict(title = 'Sentiment & Values over time')
                )}
iplot(go.Figure(figure))

Map: Sara

In [5]:
import plotly 
#plotly.tools.set_credentials_file(username='saraputri', api_key='ryjnBkAlTcxF1bPqaNde')
import plotly.tools as tls
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)
#add atm
#mining pools
#multiple marker
import plotly 
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
import json
import requests
import pandas as pd
import cufflinks as cf
import pandas as pd
In [6]:
#get the store name that can support crytocurrency 
mapbox_access_token = 'pk.eyJ1Ijoic2FyYXB1dHJpIiwiYSI6ImNqaTMzNDBuaTB2djgzdm9hZXlnMTl1cW4ifQ.8vsxIGidl6bUz-u_rK3YSQ'
#get the atm location from Coinatmfinder API
url_btc_atm = 'https://www.coinatmfinder.com/CoimATMs-API.php'
r_btc_atm = requests.get(url_btc_atm) # Decode the JSON data into a dictionary: json_data
json_data_btc_atm = r_btc_atm.json()
btc_atm_df = pd.DataFrame(json_data_btc_atm) # dictionary of currency values is stored as a list under the key "Data"
site_lat_btc_atm = btc_atm_df["lat"]
site_lon_btc_atm = btc_atm_df["lng"]
locations_name_btc_atm = btc_atm_df["location"]
buy = btc_atm_df["buy"]
sell = btc_atm_df["sell"]
address = btc_atm_df["address"]
currency = btc_atm_df["currency"]

#get the store name that support crytocurrency from Coinmap API
url_venues = 'https://coinmap.org/api/v1/venues/'
r_venues = requests.get(url_venues) # Decode the JSON data into a dictionary: json_data
json_data_venues = r_venues.json()
venues_df = pd.DataFrame(json_data_venues["venues"]) # dictionary of currency values is stored as a list under the key "Data"
site_lat = venues_df["lat"]
site_lon = venues_df["lon"]
locations_name = venues_df["name"]
category_name = venues_df["category"]

#get the Mining Pools location from CSV file
df = pd.read_csv('Miningpools.csv')
mp_site_lat = df.lat
mp_site_lon = df.lon
mp_locations_name = df.name

data = [
    go.Scattermapbox(
        lat=site_lat_btc_atm,
        lon=site_lon_btc_atm,
        mode='markers',
        name='Atm name',
        marker=dict(
           size=8,
           color='red',
           opacity=0.7
        ),
     
        text= locations_name_btc_atm + ', ' + address +'<br>'+ 'Currency: '+ currency + '<br>' +'Buy: '+ buy + ', Sell: ' + sell,
        #hoverinfo = "text"
    ),
    
    go.Scattermapbox(
        lat=site_lat,
        lon=site_lon,
        mode='markers',
        name='Venues',
        marker=dict(
            size=8,
            color='rgb(14, 88, 199)',
            #color='rgb(64,224,208)',
            opacity=0.7
        ),
        text='Store name: ' +locations_name+ '<br>' + 'Category: ' +category_name,
        #hoverinfo="text"
    ),
    
    go.Scattermapbox(
        lat=mp_site_lat,
        lon=mp_site_lon,
        mode='markers',
        name='Mining Pools',
        marker=dict(
            size=10,
            color='rgb(64,224,208)',
            opacity=0.7
        ),
        text='Mining pool: ' + mp_locations_name
    )  
]

layout = go.Layout(
    autosize=True,
    hovermode='closest',
    #height = 800, 
    #width = 900,
    title = 'Map of Bitcoin Accepting Values and <br> Map of ATM for Cryptocurrency and Mining Pools',
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=52.52,
            lon=13.4050
        ),
        pitch=0,
        zoom=10
    ),
)

fig = dict(data=data, layout=layout)
iplot(go.Figure(fig))
#Reference:
#Inspired by https://www.dataquest.io/blog/python-json-tutorial/
#Inspired by https://plot.ly/python/scattermapbox/
#Inspired by http://docs.python-requests.org/en/latest/user/quickstart/
#Inspired by https://plot.ly/python/legend/

Map showing only the venues:

In [104]:
#get the store name that can support crytocurrency 
mapbox_access_token = 'pk.eyJ1Ijoic2FyYXB1dHJpIiwiYSI6ImNqaTMzNDBuaTB2djgzdm9hZXlnMTl1cW4ifQ.8vsxIGidl6bUz-u_rK3YSQ'
url = 'https://coinmap.org/api/v1/venues/'
r = requests.get(url) # Decode the JSON data into a dictionary: json_data
json_data = r.json()
venues_df = pd.DataFrame(json_data["venues"]) # dictionary of currency values is stored as a list under the key "Data"
#btc_values_df["timestamp"] = pd.to_datetime(btc_values_df["time"], unit='s') # clean the date format: up to seconds, without miliseconds
#venues_df.info()
site_lat = venues_df["lat"]
site_lon = venues_df["lon"]
locations_name = venues_df["name"]
category_name = venues_df["category"]

#get the address data from google maps from latitude and longitude

data = [
    go.Scattermapbox(
        lat=site_lat,
        lon=site_lon,
        mode='markers',
        marker=dict(
            size=9
        ),
        text='store name: ' +locations_name+ ', ' + 'category: ' +category_name,
        hoverinfo="text"
    )
]

layout = go.Layout(
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=52.52,
            lon=13.4050
        ),
        pitch=0,
        zoom=10
    ),
)

fig = dict(data=data, layout=layout)
iplot(go.Figure(fig))

NER: Irina

In [105]:
#______________________________________________________________________________
# Preprocessing for the NER = Named Entity Recognition
#______________________________________________________________________________
import pickle
with open('Pickle_dash/ccn_df_tagged.pkl', 'rb') as f:
    ccn_df_tagged = pickle.load(f)

with open('Pickle_dash/df_agg_count_ccn.pkl', 'rb') as f:
    df_agg_count_ccn = pickle.load(f)

with open('Pickle_dash/df_btc_reddit_org_agg.pkl', 'rb') as f:
    df_btc_reddit_org_agg = pickle.load(f)

with open('Pickle_dash/df_btc_reddit_pep_agg.pkl', 'rb') as f:
    df_btc_reddit_pep_agg = pickle.load(f)

with open('Pickle_dash/df_tweet_agg_sum.pkl', 'rb') as f:
    df_tweet_agg_sum = pickle.load(f)

with open('Pickle_dash/df_tweet_btc_agg_sum.pkl', 'rb') as f:
    df_tweet_btc_agg_sum = pickle.load(f)

with open('Pickle_dash/df_tweet_eth_agg_sum.pkl', 'rb') as f:
    df_tweet_eth_agg_sum = pickle.load(f)

with open('Pickle_dash/df_tweet_pep_agg.pkl', 'rb') as f:
    df_tweet_pep_agg = pickle.load(f)

with open('Pickle_dash/df_tweet_pep_btc_agg.pkl', 'rb') as f:
    df_tweet_pep_btc_agg = pickle.load(f)

with open('Pickle_dash/df_tweet_pep_eth_agg.pkl', 'rb') as f:
    df_tweet_pep_eth_agg = pickle.load(f)

with open('Pickle_dash/df_eth_reddit_org_agg.pkl', 'rb') as f:
    df_eth_reddit_org_agg = pickle.load(f)

with open('Pickle_dash/df_eth_reddit_pep_agg.pkl', 'rb') as f:
    df_eth_reddit_pep_agg = pickle.load(f)

Which organizations are currently mentioned in the News and Social Media with respect to Bitcoin and Ethereum?

In [106]:
figure = {'data':[
    go.Bar(
    x = df_tweet_agg_sum['Number'].head(15).sort_index(ascending=False),
    y = df_tweet_agg_sum['Organization'].head(15).sort_index(ascending=False),
    name = "Twitter (BTC & ETH)",
    visible=True,
    orientation = 'h'
    ),
    go.Bar(
    x = df_tweet_btc_agg_sum['Number'].head(15).sort_index(ascending=False),
    y = df_tweet_btc_agg_sum['Organization'].head(15).sort_index(ascending=False),
    name = "Twitter (BTC)",
    visible=True,
    orientation = 'h'
    ),
    go.Bar(
    x = df_tweet_eth_agg_sum['Number'].head(15).sort_index(ascending=False),
    y = df_tweet_eth_agg_sum['Organization'].head(15).sort_index(ascending=False),
    name = "Twitter (ETH)",
    visible=True,
    orientation = 'h'
    ),
    go.Bar(
    x = df_btc_reddit_org_agg['Number'].head(5).sort_index(ascending=False),
    y = df_btc_reddit_org_agg['Organization'].head(5).sort_index(ascending=False),
    name = "Reddit (BTC)",
    visible=True,
    orientation = 'h',
    marker=dict(
    color='rgb(231, 60, 0)')
    ),
    go.Bar(
    x = df_eth_reddit_org_agg['Number'].head(5).sort_index(ascending=False),
    y = df_eth_reddit_org_agg['Organization'].head(5).sort_index(ascending=False),
    name = "Reddit (ETH)",
    visible=True,
    orientation = 'h',
    marker=dict(
    color='rgb(231, 60, 0)')
    ),
    go.Bar(
    x = df_agg_count_ccn['Sum'].head(15).sort_index(ascending=False),
    y = df_agg_count_ccn['Organization'].head(15).sort_index(ascending=False),
    name = "CCN news",
    visible=True,
    orientation = 'h',
    marker=dict(
    color='rgb(231, 118, 0)')
    ),

],
    'layout':go.Layout(title = 'Organizations on Twitter, Reddit & CCN news', showlegend=True,
                        updatemenus = list([
                            dict(active=-1,
                                 buttons=list([
                                    dict(label = 'Twitter (BTC & ETH)',
                                         method = 'update',
                                         args = [{'visible': [True, False, False, False, False, False]},
                                                 {'title': 'TOP 15 organizations on Twitter (BTC & ETH)'}]),
                                    dict(label = 'Twitter (BTC)',
                                         method = 'update',
                                         args = [{'visible': [False, True, False, False, False, False]},
                                                 {'title': 'TOP 15 organizations on Twitter (BTC)'}]),
                                    dict(label = 'Twitter (ETH)',
                                         method = 'update',
                                         args = [{'visible': [False, False, True, False, False, False]},
                                                 {'title': 'TOP 15 organizations on Twitter (ETH)'}]),
                                    dict(label = 'Reddit (BTC)',
                                         method = 'update',
                                         args = [{'visible': [False, False, False, True, False, False]},
                                                 {'title': 'TOP 5 organizations on Reddit (BTC)'}]),

                                    dict(label = 'Reddit (ETH)',
                                         method = 'update',
                                         args = [{'visible': [False, False, False, False, True, False]},
                                                 {'title': 'TOP 5 organizations on Reddit (ETH)'}]),
                                    dict(label = 'CCN news',
                                         method = 'update',
                                         args = [{'visible': [False, False, False, False, False, True]},
                                                 {'title': 'TOP 15 organizations on CCN news'}]),
                                    dict(label = 'Reset: show all',
                                         method = 'update',
                                         args = [{'visible': [True, True, True, True, True, True]},
                                                 {'title': 'TOP 15 organizations on CCN news'}])
                                ]),

                                direction = 'down',
                                pad = {'r': 10, 't': 10},
                                showactive = True,
                                x = 0,
                                xanchor = 'right',
                                y = 1.2,
                                yanchor = 'top'
                            )
                        ])
                        ,
                        xaxis = dict(title = 'Number of occurrences'),
                        yaxis = dict(title = ''),
                        margin=dict(
    l=250,
    r=20,
    t=70,
    b=70,
)

                )}
iplot(go.Figure(figure))

Which people are currently mentioned in the News and Social Media with respect to Bitcoin and Ethereum?

In [107]:
figure = {'data':[
    go.Bar(
    x = df_tweet_pep_agg['Number'].head(15).sort_index(ascending=False),
    y = df_tweet_pep_agg['Person'].head(15).sort_index(ascending=False),
    name = "Twitter (BTC & ETH)",
    visible=True,
    orientation = 'h'
    ),
    go.Bar(
    x = df_tweet_pep_btc_agg['Number'].head(15).sort_index(ascending=False),
    y = df_tweet_pep_btc_agg['Person'].head(15).sort_index(ascending=False),
    name = "Twitter (BTC)",
    visible=True,
    orientation = 'h'
    ),
    go.Bar(
    x = df_tweet_pep_eth_agg['Number'].head(15).sort_index(ascending=False),
    y = df_tweet_pep_eth_agg['Person'].head(15).sort_index(ascending=False),
    name = "Twitter (ETH)",
    visible=True,
    orientation = 'h'
    ),
    go.Bar(
    x = df_btc_reddit_pep_agg['Number'].head(5).sort_index(ascending=False),
    y = df_btc_reddit_pep_agg['Person'].head(5).sort_index(ascending=False),
    name = "Reddit (BTC)",
    visible=True,
    orientation = 'h',
    marker=dict(
    color='rgb(231, 60, 0)')
    ),
    go.Bar(
    x = df_eth_reddit_pep_agg['Number'].head(5).sort_index(ascending=False),
    y = df_eth_reddit_pep_agg['Person'].head(5).sort_index(ascending=False),
    name = "Reddit (ETH)",
    visible=True,
    orientation = 'h',
    marker=dict(
    color='rgb(231, 60, 0)')
    ),
],
    'layout':go.Layout(title = 'TOP people discussed on Twitter & Reddit', showlegend=True,
                        updatemenus = list([
                            dict(active=-1,
                                 buttons=list([
                                    dict(label = 'Twitter (BTC & ETH)',
                                         method = 'update',
                                         args = [{'visible': [True, False, False, False, False]},
                                                 {'title': 'TOP 15 people on Twitter (BTC & ETH)'}]),
                                    dict(label = 'Twitter (BTC)',
                                         method = 'update',
                                         args = [{'visible': [False, True, False, False, False]},
                                                 {'title': 'TOP 15 people on Twitter (BTC)'}]),
                                    dict(label = 'Twitter (ETH)',
                                         method = 'update',
                                         args = [{'visible': [False, False, True, False, False]},
                                                 {'title': 'TOP 15 people on Twitter (ETH)'}]),
                                    dict(label = 'Reddit (BTC)',
                                         method = 'update',
                                         args = [{'visible': [False, False, False, True, False]},
                                                 {'title': 'TOP 5 people on Reddit (BTC)'}]),

                                    dict(label = 'Reddit (ETH)',
                                         method = 'update',
                                         args = [{'visible': [False, False, False, False, True]},
                                                 {'title': 'TOP 5 people on Reddit (ETH)'}]),
                                    dict(label = 'Reset: show all',
                                         method = 'update',
                                         args = [{'visible': [True, True, True, True, True]},
                                                 {'title': 'TOP people on CCN news'}])

                                ]),
                                direction = 'down',
                                pad = {'r': 10, 't': 10},
                                showactive = True,
                                x = 0,
                                xanchor = 'right',
                                y = 1.2,
                                yanchor = 'top'
                            )
                        ])
                        ,
                        xaxis = dict(title = 'Number of occurrences'),
                        yaxis = dict(title = ''),
                        margin=dict(
                        l=250,
                        r=20,
                        t=70,
                        b=70,
)
                )}
iplot(go.Figure(figure))